diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 00000000000..ba5fcf8590a --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,389 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1200 +- gfx1200 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAKgPe_Wv9DOQxiTMf0uqv29biFXQUfeD7e0IgqktbPPA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 0.06] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 00000000000..9b50fadbda3 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,389 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1200 +- gfx1200 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAfFwDhrLGpf_fJnNGMQ8jmNjRh0cirDpbLofAguxu_uo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 2432 + LdsInitCVgprs: false + LdsNumBytes: 2432 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2432 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 0.05] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 00000000000..dfef89b7a94 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,389 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1200 +- gfx1200 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAoFez-_kQ-PuatlsN5JlmSHh4T79Tn3BVV663YZB3q9U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 4 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6400 + LdsInitCVgprs: false + LdsNumBytes: 6400 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 0.06] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 00000000000..7ad26d4f733 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,389 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1200 +- gfx1200 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserASXY4cG5lfvIhZiZwG5Ph-9MbindmMWVof0XwyUcbwv4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 4 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 0.06] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index 81e14ddb3e0..00000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,7014 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1200 -- gfx1200 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB256_MIWT1_1_PGR1_PLR0_SS0_SVW8_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG64_2_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR1_SS1_SVW1_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR0_SS1_SVW1_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 512, 512] - - [18, 1099.89] - - - [1024, 512, 1, 64, 1024, 1024, 1024, 512] - - [6, 2144.12] - - - [1536, 512, 1, 64, 1536, 1536, 1536, 512] - - [7, 2764.79] - - - [2048, 512, 1, 64, 2048, 2048, 2048, 512] - - [19, 3519.82] - - - [2560, 512, 1, 64, 2560, 2560, 2560, 512] - - [8, 4377.96] - - - [3072, 512, 1, 64, 3072, 3072, 3072, 512] - - [6, 5057.82] - - - [512, 1024, 1, 64, 512, 512, 512, 1024] - - [7, 2087.37] - - - [1024, 1024, 1, 64, 1024, 1024, 1024, 1024] - - [6, 3734.7] - - - [1536, 1024, 1, 64, 1536, 1536, 1536, 1024] - - [5, 4845.64] - - - [2048, 1024, 1, 64, 2048, 2048, 2048, 1024] - - [8, 6096.37] - - - [2560, 1024, 1, 64, 2560, 2560, 2560, 1024] - - [5, 7148.67] - - - [3072, 1024, 1, 64, 3072, 3072, 3072, 1024] - - [5, 7922.19] - - - [512, 1536, 1, 64, 512, 512, 512, 1536] - - [5, 2895.54] - - - [1024, 1536, 1, 64, 1024, 1024, 1024, 1536] - - [24, 4027.82] - - - [1536, 1536, 1, 64, 1536, 1536, 1536, 1536] - - [5, 6702.84] - - - [2048, 1536, 1, 64, 2048, 2048, 2048, 1536] - - [2, 7639.17] - - - [2560, 1536, 1, 64, 2560, 2560, 2560, 1536] - - [8, 9019.2] - - - [3072, 1536, 1, 64, 3072, 3072, 3072, 1536] - - [6, 9848.67] - - - [512, 2048, 1, 64, 512, 512, 512, 2048] - - [5, 3739.91] - - - [1024, 2048, 1, 64, 1024, 1024, 1024, 2048] - - [8, 6187.86] - - - [1536, 2048, 1, 64, 1536, 1536, 1536, 2048] - - [6, 7769.18] - - - [2048, 2048, 1, 64, 2048, 2048, 2048, 2048] - - [5, 9177.28] - - - [2560, 2048, 1, 64, 2560, 2560, 2560, 2048] - - [3, 10608.8] - - - [3072, 2048, 1, 64, 3072, 3072, 3072, 2048] - - [5, 12018.4] - - - [512, 2560, 1, 64, 512, 512, 512, 2560] - - [6, 4229.73] - - - [1024, 2560, 1, 64, 1024, 1024, 1024, 2560] - - [5, 7224.09] - - - [1536, 2560, 1, 64, 1536, 1536, 1536, 2560] - - [6, 8998.24] - - - [2048, 2560, 1, 64, 2048, 2048, 2048, 2560] - - [2, 10791.5] - - - [2560, 2560, 1, 64, 2560, 2560, 2560, 2560] - - [6, 12205.3] - - - [3072, 2560, 1, 64, 3072, 3072, 3072, 2560] - - [4, 12526.4] - - - [512, 3072, 1, 64, 512, 512, 512, 3072] - - [22, 3995.76] - - - [1024, 3072, 1, 64, 1024, 1024, 1024, 3072] - - [8, 7679.97] - - - [1536, 3072, 1, 64, 1536, 1536, 1536, 3072] - - [2, 10270.0] - - - [2048, 3072, 1, 64, 2048, 2048, 2048, 3072] - - [5, 11815.3] - - - [2560, 3072, 1, 64, 2560, 2560, 2560, 3072] - - [6, 12965.9] - - - [3072, 3072, 1, 64, 3072, 3072, 3072, 3072] - - [8, 13879.2] - - - [512, 512, 1, 256, 512, 512, 512, 512] - - [10, 3903.49] - - - [1024, 512, 1, 256, 1024, 1024, 1024, 512] - - [7, 6716.76] - - - [1536, 512, 1, 256, 1536, 1536, 1536, 512] - - [7, 8778.71] - - - [2048, 512, 1, 256, 2048, 2048, 2048, 512] - - [5, 10953.9] - - - [2560, 512, 1, 256, 2560, 2560, 2560, 512] - - [19, 12527.3] - - - [3072, 512, 1, 256, 3072, 3072, 3072, 512] - - [1, 9610.21] - - - [512, 1024, 1, 256, 512, 512, 512, 1024] - - [7, 6923.26] - - - [1024, 1024, 1, 256, 1024, 1024, 1024, 1024] - - [5, 11248.1] - - - [1536, 1024, 1, 256, 1536, 1536, 1536, 1024] - - [5, 14206.2] - - - [2048, 1024, 1, 256, 2048, 2048, 2048, 1024] - - [5, 16510.7] - - - [2560, 1024, 1, 256, 2560, 2560, 2560, 1024] - - [5, 18361.3] - - - [3072, 1024, 1, 256, 3072, 3072, 3072, 1024] - - [16, 19686.5] - - - [512, 1536, 1, 256, 512, 512, 512, 1536] - - [5, 8780.82] - - - [1024, 1536, 1, 256, 1024, 1024, 1024, 1536] - - [5, 14508.7] - - - [1536, 1536, 1, 256, 1536, 1536, 1536, 1536] - - [16, 16967.2] - - - [2048, 1536, 1, 256, 2048, 2048, 2048, 1536] - - [5, 19927.9] - - - [2560, 1536, 1, 256, 2560, 2560, 2560, 1536] - - [5, 21248.4] - - - [3072, 1536, 1, 256, 3072, 3072, 3072, 1536] - - [5, 22807.0] - - - [512, 2048, 1, 256, 512, 512, 512, 2048] - - [17, 7492.55] - - - [1024, 2048, 1, 256, 1024, 1024, 1024, 2048] - - [5, 16369.8] - - - [1536, 2048, 1, 256, 1536, 1536, 1536, 2048] - - [1, 19975.1] - - - [2048, 2048, 1, 256, 2048, 2048, 2048, 2048] - - [16, 21887.6] - - - [2560, 2048, 1, 256, 2560, 2560, 2560, 2048] - - [5, 23864.7] - - - [3072, 2048, 1, 256, 3072, 3072, 3072, 2048] - - [5, 25187.5] - - - [512, 2560, 1, 256, 512, 512, 512, 2560] - - [5, 12600.2] - - - [1024, 2560, 1, 256, 1024, 1024, 1024, 2560] - - [19, 18341.3] - - - [1536, 2560, 1, 256, 1536, 1536, 1536, 2560] - - [5, 21642.2] - - - [2048, 2560, 1, 256, 2048, 2048, 2048, 2560] - - [5, 23746.5] - - - [2560, 2560, 1, 256, 2560, 2560, 2560, 2560] - - [5, 25374.1] - - - [3072, 2560, 1, 256, 3072, 3072, 3072, 2560] - - [5, 26900.8] - - - [512, 3072, 1, 256, 512, 512, 512, 3072] - - [16, 14040.0] - - - [1024, 3072, 1, 256, 1024, 1024, 1024, 3072] - - [5, 20017.3] - - - [1536, 3072, 1, 256, 1536, 1536, 1536, 3072] - - [5, 22886.9] - - - [2048, 3072, 1, 256, 2048, 2048, 2048, 3072] - - [5, 25132.2] - - - [2560, 3072, 1, 256, 2560, 2560, 2560, 3072] - - [1, 26647.4] - - - [3072, 3072, 1, 256, 3072, 3072, 3072, 3072] - - [19, 27327.5] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [23, 6688.31] - - - [1024, 512, 1, 512, 1024, 1024, 1024, 512] - - [16, 11672.1] - - - [1536, 512, 1, 512, 1536, 1536, 1536, 512] - - [19, 14221.5] - - - [2048, 512, 1, 512, 2048, 2048, 2048, 512] - - [16, 17068.4] - - - [2560, 512, 1, 512, 2560, 2560, 2560, 512] - - [19, 18650.2] - - - [3072, 512, 1, 512, 3072, 3072, 3072, 512] - - [5, 20663.2] - - - [512, 1024, 1, 512, 512, 512, 512, 1024] - - [19, 7379.97] - - - [1024, 1024, 1, 512, 1024, 1024, 1024, 1024] - - [5, 16821.9] - - - [1536, 1024, 1, 512, 1536, 1536, 1536, 1024] - - [5, 20655.0] - - - [2048, 1024, 1, 512, 2048, 2048, 2048, 1024] - - [19, 22773.6] - - - [2560, 1024, 1, 512, 2560, 2560, 2560, 1024] - - [16, 24229.7] - - - [3072, 1024, 1, 512, 3072, 3072, 3072, 1024] - - [19, 25706.9] - - - [512, 1536, 1, 512, 512, 512, 512, 1536] - - [5, 14320.4] - - - [1024, 1536, 1, 512, 1024, 1024, 1024, 1536] - - [5, 20400.7] - - - [1536, 1536, 1, 512, 1536, 1536, 1536, 1536] - - [5, 23523.4] - - - [2048, 1536, 1, 512, 2048, 2048, 2048, 1536] - - [16, 26308.4] - - - [2560, 1536, 1, 512, 2560, 2560, 2560, 1536] - - [19, 27021.9] - - - [3072, 1536, 1, 512, 3072, 3072, 3072, 1536] - - [5, 29369.5] - - - [512, 2048, 1, 512, 512, 512, 512, 2048] - - [5, 16680.5] - - - [1024, 2048, 1, 512, 1024, 1024, 1024, 2048] - - [19, 22463.2] - - - [1536, 2048, 1, 512, 1536, 1536, 1536, 2048] - - [16, 25764.7] - - - [2048, 2048, 1, 512, 2048, 2048, 2048, 2048] - - [5, 27882.2] - - - [2560, 2048, 1, 512, 2560, 2560, 2560, 2048] - - [5, 29294.8] - - - [3072, 2048, 1, 512, 3072, 3072, 3072, 2048] - - [16, 30521.8] - - - [512, 2560, 1, 512, 512, 512, 512, 2560] - - [19, 18251.5] - - - [1024, 2560, 1, 512, 1024, 1024, 1024, 2560] - - [16, 19232.6] - - - [1536, 2560, 1, 512, 1536, 1536, 1536, 2560] - - [16, 27498.9] - - - [2048, 2560, 1, 512, 2048, 2048, 2048, 2560] - - [19, 29866.3] - - - [2560, 2560, 1, 512, 2560, 2560, 2560, 2560] - - [19, 30820.6] - - - [3072, 2560, 1, 512, 3072, 3072, 3072, 2560] - - [5, 32075.9] - - - [512, 3072, 1, 512, 512, 512, 512, 3072] - - [22, 15962.5] - - - [1024, 3072, 1, 512, 1024, 1024, 1024, 3072] - - [16, 25657.7] - - - [1536, 3072, 1, 512, 1536, 1536, 1536, 3072] - - [16, 28584.3] - - - [2048, 3072, 1, 512, 2048, 2048, 2048, 3072] - - [5, 30594.4] - - - [2560, 3072, 1, 512, 2560, 2560, 2560, 3072] - - [1, 30950.8] - - - [3072, 3072, 1, 512, 3072, 3072, 3072, 3072] - - [16, 32420.7] - - - [512, 512, 1, 1024, 512, 512, 512, 512] - - [10, 10387.0] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 512] - - [19, 17022.2] - - - [1536, 512, 1, 1024, 1536, 1536, 1536, 512] - - [5, 20059.7] - - - [2048, 512, 1, 1024, 2048, 2048, 2048, 512] - - [19, 22638.7] - - - [2560, 512, 1, 1024, 2560, 2560, 2560, 512] - - [20, 23365.2] - - - [3072, 512, 1, 1024, 3072, 3072, 3072, 512] - - [5, 26126.8] - - - [512, 1024, 1, 1024, 512, 512, 512, 1024] - - [0, 13164.4] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [19, 23349.1] - - - [1536, 1024, 1, 1024, 1536, 1536, 1536, 1024] - - [16, 26046.3] - - - [2048, 1024, 1, 1024, 2048, 2048, 2048, 1024] - - [16, 28018.9] - - - [2560, 1024, 1, 1024, 2560, 2560, 2560, 1024] - - [5, 29343.0] - - - [3072, 1024, 1, 1024, 3072, 3072, 3072, 1024] - - [19, 30870.5] - - - [512, 1536, 1, 1024, 512, 512, 512, 1536] - - [19, 19795.4] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1536] - - [19, 25787.3] - - - [1536, 1536, 1, 1024, 1536, 1536, 1536, 1536] - - [16, 24877.5] - - - [2048, 1536, 1, 1024, 2048, 2048, 2048, 1536] - - [1, 30148.3] - - - [2560, 1536, 1, 1024, 2560, 2560, 2560, 1536] - - [19, 32041.5] - - - [3072, 1536, 1, 1024, 3072, 3072, 3072, 1536] - - [5, 32160.4] - - - [512, 2048, 1, 1024, 512, 512, 512, 2048] - - [19, 23000.9] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 2048] - - [19, 27958.6] - - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 2048] - - [20, 30353.3] - - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 2048] - - [16, 32910.4] - - - [2560, 2048, 1, 1024, 2560, 2560, 2560, 2048] - - [1, 33207.0] - - - [3072, 2048, 1, 1024, 3072, 3072, 3072, 2048] - - [1, 34410.8] - - - [512, 2560, 1, 1024, 512, 512, 512, 2560] - - [19, 19293.7] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 2560] - - [20, 28586.9] - - - [1536, 2560, 1, 1024, 1536, 1536, 1536, 2560] - - [5, 31516.4] - - - [2048, 2560, 1, 1024, 2048, 2048, 2048, 2560] - - [5, 32695.5] - - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 2560] - - [16, 33584.5] - - - [3072, 2560, 1, 1024, 3072, 3072, 3072, 2560] - - [5, 34444.3] - - - [512, 3072, 1, 1024, 512, 512, 512, 3072] - - [5, 26250.5] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] - - [16, 30288.9] - - - [1536, 3072, 1, 1024, 1536, 1536, 1536, 3072] - - [19, 32653.5] - - - [2048, 3072, 1, 1024, 2048, 2048, 2048, 3072] - - [19, 34049.1] - - - [2560, 3072, 1, 1024, 2560, 2560, 2560, 3072] - - [16, 34721.0] - - - [3072, 3072, 1, 1024, 3072, 3072, 3072, 3072] - - [5, 34981.8] - - - [512, 512, 1, 2048, 512, 512, 512, 512] - - [11, 15358.5] - - - [1024, 512, 1, 2048, 1024, 1024, 1024, 512] - - [1, 23129.8] - - - [1536, 512, 1, 2048, 1536, 1536, 1536, 512] - - [19, 24494.7] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 512] - - [5, 27958.6] - - - [2560, 512, 1, 2048, 2560, 2560, 2560, 512] - - [16, 28495.7] - - - [3072, 512, 1, 2048, 3072, 3072, 3072, 512] - - [19, 29965.7] - - - [512, 1024, 1, 2048, 512, 512, 512, 1024] - - [19, 22924.3] - - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 1024] - - [16, 27329.3] - - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 1024] - - [19, 30559.4] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 1024] - - [19, 31352.9] - - - [2560, 1024, 1, 2048, 2560, 2560, 2560, 1024] - - [16, 32903.2] - - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 1024] - - [19, 34230.3] - - - [512, 1536, 1, 2048, 512, 512, 512, 1536] - - [19, 24394.2] - - - [1024, 1536, 1, 2048, 1024, 1024, 1024, 1536] - - [5, 29957.4] - - - [1536, 1536, 1, 2048, 1536, 1536, 1536, 1536] - - [5, 32132.4] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 1536] - - [5, 32967.6] - - - [2560, 1536, 1, 2048, 2560, 2560, 2560, 1536] - - [19, 34108.8] - - - [3072, 1536, 1, 2048, 3072, 3072, 3072, 1536] - - [5, 35303.1] - - - [512, 2048, 1, 2048, 512, 512, 512, 2048] - - [19, 27543.7] - - - [1024, 2048, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 31852.7] - - - [1536, 2048, 1, 2048, 1536, 1536, 1536, 2048] - - [5, 33429.5] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [5, 35261.2] - - - [2560, 2048, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 35012.8] - - - [3072, 2048, 1, 2048, 3072, 3072, 3072, 2048] - - [5, 34927.5] - - - [512, 2560, 1, 2048, 512, 512, 512, 2560] - - [19, 28473.1] - - - [1024, 2560, 1, 2048, 1024, 1024, 1024, 2560] - - [16, 32443.1] - - - [1536, 2560, 1, 2048, 1536, 1536, 1536, 2560] - - [19, 33703.4] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2560] - - [19, 34952.8] - - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2560] - - [19, 35427.5] - - - [3072, 2560, 1, 2048, 3072, 3072, 3072, 2560] - - [19, 35568.4] - - - [512, 3072, 1, 2048, 512, 512, 512, 3072] - - [1, 30643.9] - - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 3072] - - [5, 33135.0] - - - [1536, 3072, 1, 2048, 1536, 1536, 1536, 3072] - - [19, 35086.0] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 3072] - - [16, 35286.5] - - - [2560, 3072, 1, 2048, 2560, 2560, 2560, 3072] - - [5, 35540.8] - - - [3072, 3072, 1, 2048, 3072, 3072, 3072, 3072] - - [19, 35994.4] - - - [512, 512, 1, 3072, 512, 512, 512, 512] - - [10, 17907.8] - - - [1024, 512, 1, 3072, 1024, 1024, 1024, 512] - - [16, 25523.4] - - - [1536, 512, 1, 3072, 1536, 1536, 1536, 512] - - [19, 26395.7] - - - [2048, 512, 1, 3072, 2048, 2048, 2048, 512] - - [16, 29236.7] - - - [2560, 512, 1, 3072, 2560, 2560, 2560, 512] - - [5, 30260.9] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 512] - - [19, 31219.3] - - - [512, 1024, 1, 3072, 512, 512, 512, 1024] - - [3, 24676.9] - - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 1024] - - [19, 29555.2] - - - [1536, 1024, 1, 3072, 1536, 1536, 1536, 1024] - - [5, 31553.2] - - - [2048, 1024, 1, 3072, 2048, 2048, 2048, 1024] - - [5, 32571.0] - - - [2560, 1024, 1, 3072, 2560, 2560, 2560, 1024] - - [16, 33635.7] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] - - [1, 34003.7] - - - [512, 1536, 1, 3072, 512, 512, 512, 1536] - - [5, 26965.3] - - - [1024, 1536, 1, 3072, 1024, 1024, 1024, 1536] - - [16, 31697.3] - - - [1536, 1536, 1, 3072, 1536, 1536, 1536, 1536] - - [19, 33471.2] - - - [2048, 1536, 1, 3072, 2048, 2048, 2048, 1536] - - [19, 34142.1] - - - [2560, 1536, 1, 3072, 2560, 2560, 2560, 1536] - - [5, 34520.7] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 1536] - - [1, 35005.5] - - - [512, 2048, 1, 3072, 512, 512, 512, 2048] - - [1, 30031.5] - - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 2048] - - [19, 33287.7] - - - [1536, 2048, 1, 3072, 1536, 1536, 1536, 2048] - - [16, 34538.9] - - - [2048, 2048, 1, 3072, 2048, 2048, 2048, 2048] - - [5, 34890.2] - - - [2560, 2048, 1, 3072, 2560, 2560, 2560, 2048] - - [19, 35540.9] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 2048] - - [19, 35493.7] - - - [512, 2560, 1, 3072, 512, 512, 512, 2560] - - [19, 30412.2] - - - [1024, 2560, 1, 3072, 1024, 1024, 1024, 2560] - - [16, 34182.2] - - - [1536, 2560, 1, 3072, 1536, 1536, 1536, 2560] - - [5, 34612.4] - - - [2048, 2560, 1, 3072, 2048, 2048, 2048, 2560] - - [5, 34945.9] - - - [2560, 2560, 1, 3072, 2560, 2560, 2560, 2560] - - [19, 35803.3] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 2560] - - [1, 35815.1] - - - [512, 3072, 1, 3072, 512, 512, 512, 3072] - - [1, 32031.1] - - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] - - [19, 33794.0] - - - [1536, 3072, 1, 3072, 1536, 1536, 1536, 3072] - - [1, 34947.1] - - - [2048, 3072, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 35799.6] - - - [2560, 3072, 1, 3072, 2560, 2560, 2560, 3072] - - [1, 35674.7] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [19, 36047.4] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [13, 6.37979e-05] - - - [1, 1, 1, 64, 1, 1, 1, 1] - - [15, 0.004] - - - [1, 64, 1, 1, 1, 1, 1, 64] - - [13, 0.00423883] - - - [64, 1, 1, 1, 64, 64, 64, 1] - - [13, 0.00425858] - - - [64, 64, 1, 1, 64, 64, 64, 64] - - [9, 0.269323] - - - [64, 1, 1, 64, 64, 64, 64, 1] - - [12, 0.267914] - - - [1, 64, 1, 64, 1, 1, 1, 64] - - [15, 0.266945] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [14, 17.0789] - - - [64, 64, 1, 256, 64, 64, 64, 64] - - [14, 62.8228] - - - [64, 64, 1, 512, 64, 64, 64, 64] - - [21, 117.235] - - - [64, 64, 1, 1024, 64, 64, 64, 64] - - [9, 196.083] - - - [64, 64, 1, 2048, 64, 64, 64, 64] - - [21, 307.602] - - - [64, 64, 1, 4096, 64, 64, 64, 64] - - [9, 442.484] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index d136ffeb70f..00000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,7273 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1200 -- gfx1200 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SVW8_TLDS0_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS0_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 512, 64] - - [16, 1102.78] - - - [1024, 512, 1, 64, 1024, 1024, 1024, 64] - - [6, 2110.34] - - - [1536, 512, 1, 64, 1536, 1536, 1536, 64] - - [1, 2716.22] - - - [2048, 512, 1, 64, 2048, 2048, 2048, 64] - - [2, 3824.3] - - - [2560, 512, 1, 64, 2560, 2560, 2560, 64] - - [3, 4454.8] - - - [3072, 512, 1, 64, 3072, 3072, 3072, 64] - - [17, 4254.67] - - - [512, 1024, 1, 64, 512, 512, 512, 64] - - [14, 2122.35] - - - [1024, 1024, 1, 64, 1024, 1024, 1024, 64] - - [20, 3584.78] - - - [1536, 1024, 1, 64, 1536, 1536, 1536, 64] - - [3, 5123.6] - - - [2048, 1024, 1, 64, 2048, 2048, 2048, 64] - - [3, 6276.11] - - - [2560, 1024, 1, 64, 2560, 2560, 2560, 64] - - [6, 6942.63] - - - [3072, 1024, 1, 64, 3072, 3072, 3072, 64] - - [7, 7808.35] - - - [512, 1536, 1, 64, 512, 512, 512, 64] - - [4, 2789.38] - - - [1024, 1536, 1, 64, 1024, 1024, 1024, 64] - - [3, 5102.82] - - - [1536, 1536, 1, 64, 1536, 1536, 1536, 64] - - [2, 6741.75] - - - [2048, 1536, 1, 64, 2048, 2048, 2048, 64] - - [3, 8102.81] - - - [2560, 1536, 1, 64, 2560, 2560, 2560, 64] - - [2, 9207.63] - - - [3072, 1536, 1, 64, 3072, 3072, 3072, 64] - - [3, 10299.8] - - - [512, 2048, 1, 64, 512, 512, 512, 64] - - [7, 3766.25] - - - [1024, 2048, 1, 64, 1024, 1024, 1024, 64] - - [2, 6254.18] - - - [1536, 2048, 1, 64, 1536, 1536, 1536, 64] - - [3, 7919.07] - - - [2048, 2048, 1, 64, 2048, 2048, 2048, 64] - - [2, 9321.0] - - - [2560, 2048, 1, 64, 2560, 2560, 2560, 64] - - [3, 10635.7] - - - [3072, 2048, 1, 64, 3072, 3072, 3072, 64] - - [3, 12069.0] - - - [512, 2560, 1, 64, 512, 512, 512, 64] - - [3, 4218.03] - - - [1024, 2560, 1, 64, 1024, 1024, 1024, 64] - - [2, 7070.04] - - - [1536, 2560, 1, 64, 1536, 1536, 1536, 64] - - [6, 9325.68] - - - [2048, 2560, 1, 64, 2048, 2048, 2048, 64] - - [3, 11009.6] - - - [2560, 2560, 1, 64, 2560, 2560, 2560, 64] - - [3, 12362.0] - - - [3072, 2560, 1, 64, 3072, 3072, 3072, 64] - - [7, 13337.8] - - - [512, 3072, 1, 64, 512, 512, 512, 64] - - [6, 5155.08] - - - [1024, 3072, 1, 64, 1024, 1024, 1024, 64] - - [7, 7920.79] - - - [1536, 3072, 1, 64, 1536, 1536, 1536, 64] - - [6, 10022.4] - - - [2048, 3072, 1, 64, 2048, 2048, 2048, 64] - - [3, 12052.6] - - - [2560, 3072, 1, 64, 2560, 2560, 2560, 64] - - [3, 13362.6] - - - [3072, 3072, 1, 64, 3072, 3072, 3072, 64] - - [2, 14403.3] - - - [512, 512, 1, 256, 512, 512, 512, 256] - - [0, 3747.32] - - - [1024, 512, 1, 256, 1024, 1024, 1024, 256] - - [6, 7243.46] - - - [1536, 512, 1, 256, 1536, 1536, 1536, 256] - - [8, 7307.28] - - - [2048, 512, 1, 256, 2048, 2048, 2048, 256] - - [2, 11509.0] - - - [2560, 512, 1, 256, 2560, 2560, 2560, 256] - - [2, 12441.2] - - - [3072, 512, 1, 256, 3072, 3072, 3072, 256] - - [14, 14099.2] - - - [512, 1024, 1, 256, 512, 512, 512, 256] - - [14, 7025.08] - - - [1024, 1024, 1, 256, 1024, 1024, 1024, 256] - - [6, 11538.9] - - - [1536, 1024, 1, 256, 1536, 1536, 1536, 256] - - [6, 14617.0] - - - [2048, 1024, 1, 256, 2048, 2048, 2048, 256] - - [2, 16680.5] - - - [2560, 1024, 1, 256, 2560, 2560, 2560, 256] - - [6, 18354.1] - - - [3072, 1024, 1, 256, 3072, 3072, 3072, 256] - - [2, 20047.2] - - - [512, 1536, 1, 256, 512, 512, 512, 256] - - [2, 9348.59] - - - [1024, 1536, 1, 256, 1024, 1024, 1024, 256] - - [6, 14485.5] - - - [1536, 1536, 1, 256, 1536, 1536, 1536, 256] - - [6, 17521.9] - - - [2048, 1536, 1, 256, 2048, 2048, 2048, 256] - - [2, 20114.8] - - - [2560, 1536, 1, 256, 2560, 2560, 2560, 256] - - [6, 22041.2] - - - [3072, 1536, 1, 256, 3072, 3072, 3072, 256] - - [2, 23013.8] - - - [512, 2048, 1, 256, 512, 512, 512, 256] - - [2, 11085.3] - - - [1024, 2048, 1, 256, 1024, 1024, 1024, 256] - - [2, 16850.9] - - - [1536, 2048, 1, 256, 1536, 1536, 1536, 256] - - [2, 20295.0] - - - [2048, 2048, 1, 256, 2048, 2048, 2048, 256] - - [2, 22703.6] - - - [2560, 2048, 1, 256, 2560, 2560, 2560, 256] - - [2, 24267.1] - - - [3072, 2048, 1, 256, 3072, 3072, 3072, 256] - - [6, 26107.7] - - - [512, 2560, 1, 256, 512, 512, 512, 256] - - [2, 12965.9] - - - [1024, 2560, 1, 256, 1024, 1024, 1024, 256] - - [2, 18366.6] - - - [1536, 2560, 1, 256, 1536, 1536, 1536, 256] - - [2, 22004.8] - - - [2048, 2560, 1, 256, 2048, 2048, 2048, 256] - - [2, 24317.7] - - - [2560, 2560, 1, 256, 2560, 2560, 2560, 256] - - [2, 25695.5] - - - [3072, 2560, 1, 256, 3072, 3072, 3072, 256] - - [2, 26936.9] - - - [512, 3072, 1, 256, 512, 512, 512, 256] - - [14, 14423.2] - - - [1024, 3072, 1, 256, 1024, 1024, 1024, 256] - - [6, 20027.0] - - - [1536, 3072, 1, 256, 1536, 1536, 1536, 256] - - [2, 23488.8] - - - [2048, 3072, 1, 256, 2048, 2048, 2048, 256] - - [19, 25372.4] - - - [2560, 3072, 1, 256, 2560, 2560, 2560, 256] - - [2, 26893.4] - - - [3072, 3072, 1, 256, 3072, 3072, 3072, 256] - - [6, 28682.9] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [11, 6659.94] - - - [1024, 512, 1, 512, 1024, 1024, 1024, 512] - - [14, 11566.3] - - - [1536, 512, 1, 512, 1536, 1536, 1536, 512] - - [6, 14604.0] - - - [2048, 512, 1, 512, 2048, 2048, 2048, 512] - - [14, 17389.7] - - - [2560, 512, 1, 512, 2560, 2560, 2560, 512] - - [6, 19021.2] - - - [3072, 512, 1, 512, 3072, 3072, 3072, 512] - - [19, 20904.8] - - - [512, 1024, 1, 512, 512, 512, 512, 512] - - [14, 11964.0] - - - [1024, 1024, 1, 512, 1024, 1024, 1024, 512] - - [6, 17177.7] - - - [1536, 1024, 1, 512, 1536, 1536, 1536, 512] - - [2, 20896.7] - - - [2048, 1024, 1, 512, 2048, 2048, 2048, 512] - - [2, 17762.9] - - - [2560, 1024, 1, 512, 2560, 2560, 2560, 512] - - [19, 25167.4] - - - [3072, 1024, 1, 512, 3072, 3072, 3072, 512] - - [6, 26659.8] - - - [512, 1536, 1, 512, 512, 512, 512, 512] - - [2, 14577.3] - - - [1024, 1536, 1, 512, 1024, 1024, 1024, 512] - - [25, 15994.3] - - - [1536, 1536, 1, 512, 1536, 1536, 1536, 512] - - [14, 23856.2] - - - [2048, 1536, 1, 512, 2048, 2048, 2048, 512] - - [14, 26373.2] - - - [2560, 1536, 1, 512, 2560, 2560, 2560, 512] - - [6, 27608.5] - - - [3072, 1536, 1, 512, 3072, 3072, 3072, 512] - - [2, 29444.8] - - - [512, 2048, 1, 512, 512, 512, 512, 512] - - [6, 17566.3] - - - [1024, 2048, 1, 512, 1024, 1024, 1024, 512] - - [2, 22682.0] - - - [1536, 2048, 1, 512, 1536, 1536, 1536, 512] - - [14, 26297.9] - - - [2048, 2048, 1, 512, 2048, 2048, 2048, 512] - - [2, 28275.7] - - - [2560, 2048, 1, 512, 2560, 2560, 2560, 512] - - [2, 29643.4] - - - [3072, 2048, 1, 512, 3072, 3072, 3072, 512] - - [2, 31025.5] - - - [512, 2560, 1, 512, 512, 512, 512, 512] - - [14, 18754.9] - - - [1024, 2560, 1, 512, 1024, 1024, 1024, 512] - - [2, 24558.2] - - - [1536, 2560, 1, 512, 1536, 1536, 1536, 512] - - [6, 27629.3] - - - [2048, 2560, 1, 512, 2048, 2048, 2048, 512] - - [6, 29677.8] - - - [2560, 2560, 1, 512, 2560, 2560, 2560, 512] - - [6, 31297.4] - - - [3072, 2560, 1, 512, 3072, 3072, 3072, 512] - - [6, 32040.3] - - - [512, 3072, 1, 512, 512, 512, 512, 512] - - [2, 20975.6] - - - [1024, 3072, 1, 512, 1024, 1024, 1024, 512] - - [2, 26490.8] - - - [1536, 3072, 1, 512, 1536, 1536, 1536, 512] - - [14, 29040.5] - - - [2048, 3072, 1, 512, 2048, 2048, 2048, 512] - - [6, 31019.4] - - - [2560, 3072, 1, 512, 2560, 2560, 2560, 512] - - [2, 31626.8] - - - [3072, 3072, 1, 512, 3072, 3072, 3072, 512] - - [2, 33263.9] - - - [512, 512, 1, 1024, 512, 512, 512, 1024] - - [11, 10976.3] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] - - [14, 17862.1] - - - [1536, 512, 1, 1024, 1536, 1536, 1536, 1024] - - [24, 17439.6] - - - [2048, 512, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 23430.8] - - - [2560, 512, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 24764.8] - - - [3072, 512, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 26835.5] - - - [512, 1024, 1, 1024, 512, 512, 512, 1024] - - [19, 18085.3] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 23671.6] - - - [1536, 1024, 1, 1024, 1536, 1536, 1536, 1024] - - [2, 26453.8] - - - [2048, 1024, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 28648.2] - - - [2560, 1024, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 29639.9] - - - [3072, 1024, 1, 1024, 3072, 3072, 3072, 1024] - - [6, 30994.0] - - - [512, 1536, 1, 1024, 512, 512, 512, 1024] - - [19, 20362.0] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1024] - - [19, 26651.0] - - - [1536, 1536, 1, 1024, 1536, 1536, 1536, 1024] - - [14, 28667.6] - - - [2048, 1536, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 31269.9] - - - [2560, 1536, 1, 1024, 2560, 2560, 2560, 1024] - - [19, 31786.8] - - - [3072, 1536, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 32638.1] - - - [512, 2048, 1, 1024, 512, 512, 512, 1024] - - [19, 23477.0] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 28711.4] - - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 1024] - - [2, 31338.4] - - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 32280.9] - - - [2560, 2048, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 33493.4] - - - [3072, 2048, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 33769.3] - - - [512, 2560, 1, 1024, 512, 512, 512, 1024] - - [6, 24833.8] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 29926.4] - - - [1536, 2560, 1, 1024, 1536, 1536, 1536, 1024] - - [6, 31938.4] - - - [2048, 2560, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 33211.2] - - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 1024] - - [2, 34754.8] - - - [3072, 2560, 1, 1024, 3072, 3072, 3072, 1024] - - [14, 35387.6] - - - [512, 3072, 1, 1024, 512, 512, 512, 1024] - - [14, 26864.8] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] - - [14, 31058.4] - - - [1536, 3072, 1, 1024, 1536, 1536, 1536, 1024] - - [14, 32940.1] - - - [2048, 3072, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 34567.0] - - - [2560, 3072, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 34955.1] - - - [3072, 3072, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 35700.4] - - - [512, 512, 1, 2048, 512, 512, 512, 2048] - - [12, 15600.1] - - - [1024, 512, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 23765.9] - - - [1536, 512, 1, 2048, 1536, 1536, 1536, 2048] - - [19, 20093.5] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 27942.4] - - - [2560, 512, 1, 2048, 2560, 2560, 2560, 2048] - - [6, 28709.5] - - - [3072, 512, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 30761.4] - - - [512, 1024, 1, 2048, 512, 512, 512, 2048] - - [6, 23871.8] - - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 28220.0] - - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 2048] - - [14, 30778.9] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 32535.7] - - - [2560, 1024, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 33459.0] - - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 34043.7] - - - [512, 1536, 1, 2048, 512, 512, 512, 2048] - - [14, 24983.7] - - - [1024, 1536, 1, 2048, 1024, 1024, 1024, 2048] - - [2, 30956.8] - - - [1536, 1536, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 32333.8] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 33769.3] - - - [2560, 1536, 1, 2048, 2560, 2560, 2560, 2048] - - [14, 34579.2] - - - [3072, 1536, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 34607.7] - - - [512, 2048, 1, 2048, 512, 512, 512, 2048] - - [15, 27721.8] - - - [1024, 2048, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 32373.6] - - - [1536, 2048, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 34114.2] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 34644.4] - - - [2560, 2048, 1, 2048, 2560, 2560, 2560, 2048] - - [2, 35409.6] - - - [3072, 2048, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 36021.8] - - - [512, 2560, 1, 2048, 512, 512, 512, 2048] - - [2, 28845.4] - - - [1024, 2560, 1, 2048, 1024, 1024, 1024, 2048] - - [2, 33035.2] - - - [1536, 2560, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 34694.1] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 35699.2] - - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 35409.2] - - - [3072, 2560, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 35717.4] - - - [512, 3072, 1, 2048, 512, 512, 512, 2048] - - [2, 30754.1] - - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 34233.1] - - - [1536, 3072, 1, 2048, 1536, 1536, 1536, 2048] - - [14, 35127.6] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 2048] - - [19, 35648.4] - - - [2560, 3072, 1, 2048, 2560, 2560, 2560, 2048] - - [6, 35884.1] - - - [3072, 3072, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 37382.1] - - - [512, 512, 1, 3072, 512, 512, 512, 3072] - - [12, 18127.9] - - - [1024, 512, 1, 3072, 1024, 1024, 1024, 3072] - - [19, 26442.7] - - - [1536, 512, 1, 3072, 1536, 1536, 1536, 3072] - - [6, 26322.2] - - - [2048, 512, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 30968.7] - - - [2560, 512, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 30018.1] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 32424.1] - - - [512, 1024, 1, 3072, 512, 512, 512, 3072] - - [2, 27073.0] - - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 3072] - - [2, 30600.4] - - - [1536, 1024, 1, 3072, 1536, 1536, 1536, 3072] - - [19, 32482.0] - - - [2048, 1024, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 33371.5] - - - [2560, 1024, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 34360.5] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 3072] - - [2, 34915.3] - - - [512, 1536, 1, 3072, 512, 512, 512, 3072] - - [2, 27553.8] - - - [1024, 1536, 1, 3072, 1024, 1024, 1024, 3072] - - [2, 32703.2] - - - [1536, 1536, 1, 3072, 1536, 1536, 1536, 3072] - - [6, 33662.8] - - - [2048, 1536, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 34536.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 2560, 3072] - - [6, 35147.2] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 3072] - - [19, 35200.9] - - - [512, 2048, 1, 3072, 512, 512, 512, 3072] - - [2, 31780.0] - - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 3072] - - [14, 33546.5] - - - [1536, 2048, 1, 3072, 1536, 1536, 1536, 3072] - - [14, 34553.2] - - - [2048, 2048, 1, 3072, 2048, 2048, 2048, 3072] - - [14, 35790.3] - - - [2560, 2048, 1, 3072, 2560, 2560, 2560, 3072] - - [2, 36033.7] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 36455.0] - - - [512, 2560, 1, 3072, 512, 512, 512, 3072] - - [6, 31027.4] - - - [1024, 2560, 1, 3072, 1024, 1024, 1024, 3072] - - [6, 35006.1] - - - [1536, 2560, 1, 3072, 1536, 1536, 1536, 3072] - - [19, 35047.6] - - - [2048, 2560, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 36074.1] - - - [2560, 2560, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 36488.0] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 3072] - - [14, 36276.3] - - - [512, 3072, 1, 3072, 512, 512, 512, 3072] - - [2, 33344.3] - - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] - - [14, 35036.4] - - - [1536, 3072, 1, 3072, 1536, 1536, 1536, 3072] - - [14, 36277.7] - - - [2048, 3072, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 35989.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 2560, 3072] - - [6, 36792.8] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [2, 37505.2] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [13, 6.53659e-05] - - - [1, 1, 1, 64, 1, 1, 1, 64] - - [23, 0.00415476] - - - [1, 64, 1, 1, 1, 1, 1, 1] - - [5, 0.00401506] - - - [64, 1, 1, 1, 64, 64, 64, 1] - - [12, 0.00426141] - - - [64, 64, 1, 1, 64, 64, 64, 1] - - [18, 0.268344] - - - [64, 1, 1, 64, 64, 64, 64, 64] - - [22, 0.267642] - - - [1, 64, 1, 64, 1, 1, 1, 64] - - [22, 0.266519] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [9, 16.5652] - - - [64, 64, 1, 256, 64, 64, 64, 256] - - [10, 65.3522] - - - [64, 64, 1, 512, 64, 64, 64, 512] - - [22, 116.908] - - - [64, 64, 1, 1024, 64, 64, 64, 1024] - - [22, 202.047] - - - [64, 64, 1, 2048, 64, 64, 64, 2048] - - [22, 318.541] - - - [64, 64, 1, 4096, 64, 64, 64, 4096] - - [21, 435.687] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index 9146dde94c8..00000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,8050 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1200 -- gfx1200 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS0_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS0_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 27392 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB256_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG64_2_1 - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB256_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS0_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS0_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 27392 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 64, 512] - - [4, 1083.17] - - - [1024, 512, 1, 64, 1024, 1024, 64, 512] - - [6, 2149.62] - - - [1536, 512, 1, 64, 1536, 1536, 64, 512] - - [3, 2862.44] - - - [2048, 512, 1, 64, 2048, 2048, 64, 512] - - [1, 3757.71] - - - [2560, 512, 1, 64, 2560, 2560, 64, 512] - - [9, 4330.39] - - - [3072, 512, 1, 64, 3072, 3072, 64, 512] - - [1, 5161.69] - - - [512, 1024, 1, 64, 512, 512, 64, 1024] - - [19, 2075.04] - - - [1024, 1024, 1, 64, 1024, 1024, 64, 1024] - - [20, 3608.97] - - - [1536, 1024, 1, 64, 1536, 1536, 64, 1024] - - [23, 4639.72] - - - [2048, 1024, 1, 64, 2048, 2048, 64, 1024] - - [1, 6143.81] - - - [2560, 1024, 1, 64, 2560, 2560, 64, 1024] - - [1, 7283.99] - - - [3072, 1024, 1, 64, 3072, 3072, 64, 1024] - - [6, 7999.47] - - - [512, 1536, 1, 64, 512, 512, 64, 1536] - - [6, 2970.91] - - - [1024, 1536, 1, 64, 1024, 1024, 64, 1536] - - [8, 5152.58] - - - [1536, 1536, 1, 64, 1536, 1536, 64, 1536] - - [6, 6800.96] - - - [2048, 1536, 1, 64, 2048, 2048, 64, 1536] - - [8, 8088.0] - - - [2560, 1536, 1, 64, 2560, 2560, 64, 1536] - - [2, 9133.94] - - - [3072, 1536, 1, 64, 3072, 3072, 64, 1536] - - [6, 10264.6] - - - [512, 2048, 1, 64, 512, 512, 64, 2048] - - [19, 3696.54] - - - [1024, 2048, 1, 64, 1024, 1024, 64, 2048] - - [1, 6298.34] - - - [1536, 2048, 1, 64, 1536, 1536, 64, 2048] - - [19, 7790.22] - - - [2048, 2048, 1, 64, 2048, 2048, 64, 2048] - - [7, 9700.44] - - - [2560, 2048, 1, 64, 2560, 2560, 64, 2048] - - [9, 10984.3] - - - [3072, 2048, 1, 64, 3072, 3072, 64, 2048] - - [9, 12022.0] - - - [512, 2560, 1, 64, 512, 512, 64, 2560] - - [20, 4391.71] - - - [1024, 2560, 1, 64, 1024, 1024, 64, 2560] - - [6, 7101.62] - - - [1536, 2560, 1, 64, 1536, 1536, 64, 2560] - - [1, 9360.37] - - - [2048, 2560, 1, 64, 2048, 2048, 64, 2560] - - [1, 11006.0] - - - [2560, 2560, 1, 64, 2560, 2560, 64, 2560] - - [8, 12365.6] - - - [3072, 2560, 1, 64, 3072, 3072, 64, 2560] - - [7, 13479.1] - - - [512, 3072, 1, 64, 512, 512, 64, 3072] - - [0, 4434.99] - - - [1024, 3072, 1, 64, 1024, 1024, 64, 3072] - - [6, 8175.2] - - - [1536, 3072, 1, 64, 1536, 1536, 64, 3072] - - [6, 10493.2] - - - [2048, 3072, 1, 64, 2048, 2048, 64, 3072] - - [7, 11785.7] - - - [2560, 3072, 1, 64, 2560, 2560, 64, 3072] - - [7, 13452.3] - - - [3072, 3072, 1, 64, 3072, 3072, 64, 3072] - - [6, 14621.4] - - - [512, 512, 1, 256, 512, 512, 256, 512] - - [13, 3950.6] - - - [1024, 512, 1, 256, 1024, 1024, 256, 512] - - [21, 7257.17] - - - [1536, 512, 1, 256, 1536, 1536, 256, 512] - - [5, 7936.24] - - - [2048, 512, 1, 256, 2048, 2048, 256, 512] - - [1, 11596.0] - - - [2560, 512, 1, 256, 2560, 2560, 256, 512] - - [21, 12681.2] - - - [3072, 512, 1, 256, 3072, 3072, 256, 512] - - [22, 13454.7] - - - [512, 1024, 1, 256, 512, 512, 256, 1024] - - [27, 5362.92] - - - [1024, 1024, 1, 256, 1024, 1024, 256, 1024] - - [21, 11518.9] - - - [1536, 1024, 1, 256, 1536, 1536, 256, 1024] - - [6, 14363.5] - - - [2048, 1024, 1, 256, 2048, 2048, 256, 1024] - - [6, 16690.9] - - - [2560, 1024, 1, 256, 2560, 2560, 256, 1024] - - [1, 18860.1] - - - [3072, 1024, 1, 256, 3072, 3072, 256, 1024] - - [6, 19744.2] - - - [512, 1536, 1, 256, 512, 512, 256, 1536] - - [19, 9025.27] - - - [1024, 1536, 1, 256, 1024, 1024, 256, 1536] - - [1, 14474.8] - - - [1536, 1536, 1, 256, 1536, 1536, 256, 1536] - - [6, 17097.1] - - - [2048, 1536, 1, 256, 2048, 2048, 256, 1536] - - [6, 20107.5] - - - [2560, 1536, 1, 256, 2560, 2560, 256, 1536] - - [6, 21992.8] - - - [3072, 1536, 1, 256, 3072, 3072, 256, 1536] - - [6, 23393.3] - - - [512, 2048, 1, 256, 512, 512, 256, 2048] - - [19, 11514.2] - - - [1024, 2048, 1, 256, 1024, 1024, 256, 2048] - - [6, 16835.1] - - - [1536, 2048, 1, 256, 1536, 1536, 256, 2048] - - [6, 20200.8] - - - [2048, 2048, 1, 256, 2048, 2048, 256, 2048] - - [6, 21977.2] - - - [2560, 2048, 1, 256, 2560, 2560, 256, 2048] - - [6, 24251.5] - - - [3072, 2048, 1, 256, 3072, 3072, 256, 2048] - - [6, 25706.9] - - - [512, 2560, 1, 256, 512, 512, 256, 2560] - - [6, 8237.86] - - - [1024, 2560, 1, 256, 1024, 1024, 256, 2560] - - [6, 18794.3] - - - [1536, 2560, 1, 256, 1536, 1536, 256, 2560] - - [6, 21764.1] - - - [2048, 2560, 1, 256, 2048, 2048, 256, 2560] - - [6, 23881.9] - - - [2560, 2560, 1, 256, 2560, 2560, 256, 2560] - - [21, 25509.3] - - - [3072, 2560, 1, 256, 3072, 3072, 256, 2560] - - [6, 27707.3] - - - [512, 3072, 1, 256, 512, 512, 256, 3072] - - [8, 14498.3] - - - [1024, 3072, 1, 256, 1024, 1024, 256, 3072] - - [6, 19817.3] - - - [1536, 3072, 1, 256, 1536, 1536, 256, 3072] - - [6, 22961.1] - - - [2048, 3072, 1, 256, 2048, 2048, 256, 3072] - - [6, 25392.4] - - - [2560, 3072, 1, 256, 2560, 2560, 256, 3072] - - [8, 27016.5] - - - [3072, 3072, 1, 256, 3072, 3072, 256, 3072] - - [8, 28086.5] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [26, 6889.5] - - - [1024, 512, 1, 512, 1024, 1024, 512, 512] - - [1, 12085.4] - - - [1536, 512, 1, 512, 1536, 1536, 512, 512] - - [14, 9874.52] - - - [2048, 512, 1, 512, 2048, 2048, 512, 512] - - [6, 17180.4] - - - [2560, 512, 1, 512, 2560, 2560, 512, 512] - - [6, 18934.8] - - - [3072, 512, 1, 512, 3072, 3072, 512, 512] - - [1, 21194.2] - - - [512, 1024, 1, 512, 512, 512, 512, 1024] - - [6, 12156.6] - - - [1024, 1024, 1, 512, 1024, 1024, 512, 1024] - - [21, 17796.6] - - - [1536, 1024, 1, 512, 1536, 1536, 512, 1024] - - [1, 21025.2] - - - [2048, 1024, 1, 512, 2048, 2048, 512, 1024] - - [6, 23354.1] - - - [2560, 1024, 1, 512, 2560, 2560, 512, 1024] - - [21, 24928.3] - - - [3072, 1024, 1, 512, 3072, 3072, 512, 1024] - - [1, 26813.2] - - - [512, 1536, 1, 512, 512, 512, 512, 1536] - - [21, 14516.6] - - - [1024, 1536, 1, 512, 1024, 1024, 512, 1536] - - [6, 21185.9] - - - [1536, 1536, 1, 512, 1536, 1536, 512, 1536] - - [6, 24108.8] - - - [2048, 1536, 1, 512, 2048, 2048, 512, 1536] - - [6, 26635.3] - - - [2560, 1536, 1, 512, 2560, 2560, 512, 1536] - - [21, 27782.2] - - - [3072, 1536, 1, 512, 3072, 3072, 512, 1536] - - [6, 29862.3] - - - [512, 2048, 1, 512, 512, 512, 512, 2048] - - [19, 12205.1] - - - [1024, 2048, 1, 512, 1024, 1024, 512, 2048] - - [6, 23129.8] - - - [1536, 2048, 1, 512, 1536, 1536, 512, 2048] - - [6, 26490.8] - - - [2048, 2048, 1, 512, 2048, 2048, 512, 2048] - - [6, 28155.0] - - - [2560, 2048, 1, 512, 2560, 2560, 512, 2048] - - [1, 30072.6] - - - [3072, 2048, 1, 512, 3072, 3072, 512, 2048] - - [1, 30680.4] - - - [512, 2560, 1, 512, 512, 512, 512, 2560] - - [6, 18757.3] - - - [1024, 2560, 1, 512, 1024, 1024, 512, 2560] - - [6, 19614.2] - - - [1536, 2560, 1, 512, 1536, 1536, 512, 2560] - - [6, 27576.2] - - - [2048, 2560, 1, 512, 2048, 2048, 512, 2560] - - [1, 30077.5] - - - [2560, 2560, 1, 512, 2560, 2560, 512, 2560] - - [6, 31499.1] - - - [3072, 2560, 1, 512, 3072, 3072, 512, 2560] - - [6, 32458.2] - - - [512, 3072, 1, 512, 512, 512, 512, 3072] - - [21, 21039.2] - - - [1024, 3072, 1, 512, 1024, 1024, 512, 3072] - - [1, 26199.0] - - - [1536, 3072, 1, 512, 1536, 1536, 512, 3072] - - [6, 29460.8] - - - [2048, 3072, 1, 512, 2048, 2048, 512, 3072] - - [1, 30827.6] - - - [2560, 3072, 1, 512, 2560, 2560, 512, 3072] - - [1, 31993.0] - - - [3072, 3072, 1, 512, 3072, 3072, 512, 3072] - - [6, 33276.4] - - - [512, 512, 1, 1024, 512, 512, 1024, 512] - - [12, 10971.8] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 512] - - [21, 18310.7] - - - [1536, 512, 1, 1024, 1536, 1536, 1024, 512] - - [1, 19266.4] - - - [2048, 512, 1, 1024, 2048, 2048, 1024, 512] - - [19, 18428.7] - - - [2560, 512, 1, 1024, 2560, 2560, 1024, 512] - - [1, 24497.7] - - - [3072, 512, 1, 1024, 3072, 3072, 1024, 512] - - [1, 26869.1] - - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] - - [21, 18273.3] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [6, 24337.7] - - - [1536, 1024, 1, 1024, 1536, 1536, 1024, 1024] - - [19, 26873.8] - - - [2048, 1024, 1, 1024, 2048, 2048, 1024, 1024] - - [19, 29022.4] - - - [2560, 1024, 1, 1024, 2560, 2560, 1024, 1024] - - [19, 30283.1] - - - [3072, 1024, 1, 1024, 3072, 3072, 1024, 1024] - - [1, 31153.2] - - - [512, 1536, 1, 1024, 512, 512, 1024, 1536] - - [6, 20246.8] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1536] - - [1, 27449.5] - - - [1536, 1536, 1, 1024, 1536, 1536, 1024, 1536] - - [21, 28920.3] - - - [2048, 1536, 1, 1024, 2048, 2048, 1024, 1536] - - [6, 31424.3] - - - [2560, 1536, 1, 1024, 2560, 2560, 1024, 1536] - - [1, 31814.4] - - - [3072, 1536, 1, 1024, 3072, 3072, 1024, 1536] - - [1, 33218.0] - - - [512, 2048, 1, 1024, 512, 512, 1024, 2048] - - [21, 24365.6] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 2048] - - [21, 29038.1] - - - [1536, 2048, 1, 1024, 1536, 1536, 1024, 2048] - - [1, 31145.5] - - - [2048, 2048, 1, 1024, 2048, 2048, 1024, 2048] - - [19, 32352.7] - - - [2560, 2048, 1, 1024, 2560, 2560, 1024, 2048] - - [19, 33927.1] - - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 2048] - - [1, 33613.0] - - - [512, 2560, 1, 1024, 512, 512, 1024, 2560] - - [6, 24540.4] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 2560] - - [22, 29472.2] - - - [1536, 2560, 1, 1024, 1536, 1536, 1024, 2560] - - [19, 32023.7] - - - [2048, 2560, 1, 1024, 2048, 2048, 1024, 2560] - - [6, 33480.9] - - - [2560, 2560, 1, 1024, 2560, 2560, 1024, 2560] - - [1, 33714.7] - - - [3072, 2560, 1, 1024, 3072, 3072, 1024, 2560] - - [21, 34741.4] - - - [512, 3072, 1, 1024, 512, 512, 1024, 3072] - - [1, 27182.6] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] - - [6, 31539.8] - - - [1536, 3072, 1, 1024, 1536, 1536, 1024, 3072] - - [21, 33705.5] - - - [2048, 3072, 1, 1024, 2048, 2048, 1024, 3072] - - [19, 34361.0] - - - [2560, 3072, 1, 1024, 2560, 2560, 1024, 3072] - - [19, 35487.5] - - - [3072, 3072, 1, 1024, 3072, 3072, 1024, 3072] - - [8, 35326.3] - - - [512, 512, 1, 2048, 512, 512, 2048, 512] - - [28, 15679.9] - - - [1024, 512, 1, 2048, 1024, 1024, 2048, 512] - - [21, 24296.6] - - - [1536, 512, 1, 2048, 1536, 1536, 2048, 512] - - [21, 24278.2] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 512] - - [6, 29739.2] - - - [2560, 512, 1, 2048, 2560, 2560, 2048, 512] - - [6, 28423.0] - - - [3072, 512, 1, 2048, 3072, 3072, 2048, 512] - - [21, 31795.6] - - - [512, 1024, 1, 2048, 512, 512, 2048, 1024] - - [21, 24437.9] - - - [1024, 1024, 1, 2048, 1024, 1024, 2048, 1024] - - [6, 29380.4] - - - [1536, 1024, 1, 2048, 1536, 1536, 2048, 1024] - - [21, 31789.3] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 1024] - - [20, 31191.0] - - - [2560, 1024, 1, 2048, 2560, 2560, 2048, 1024] - - [21, 33912.0] - - - [3072, 1024, 1, 2048, 3072, 3072, 2048, 1024] - - [6, 35255.0] - - - [512, 1536, 1, 2048, 512, 512, 2048, 1536] - - [6, 24873.6] - - - [1024, 1536, 1, 2048, 1024, 1024, 2048, 1536] - - [6, 31882.3] - - - [1536, 1536, 1, 2048, 1536, 1536, 2048, 1536] - - [21, 31570.7] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 1536] - - [21, 34462.4] - - - [2560, 1536, 1, 2048, 2560, 2560, 2048, 1536] - - [21, 34598.5] - - - [3072, 1536, 1, 2048, 3072, 3072, 2048, 1536] - - [6, 35573.9] - - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] - - [6, 29667.1] - - - [1024, 2048, 1, 2048, 1024, 1024, 2048, 2048] - - [6, 33209.9] - - - [1536, 2048, 1, 2048, 1536, 1536, 2048, 2048] - - [6, 34438.3] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 35818.1] - - - [2560, 2048, 1, 2048, 2560, 2560, 2048, 2048] - - [6, 36040.1] - - - [3072, 2048, 1, 2048, 3072, 3072, 2048, 2048] - - [6, 36782.1] - - - [512, 2560, 1, 2048, 512, 512, 2048, 2560] - - [22, 27572.0] - - - [1024, 2560, 1, 2048, 1024, 1024, 2048, 2560] - - [1, 33165.8] - - - [1536, 2560, 1, 2048, 1536, 1536, 2048, 2560] - - [6, 34165.4] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2560] - - [6, 35669.4] - - - [2560, 2560, 1, 2048, 2560, 2560, 2048, 2560] - - [21, 35789.5] - - - [3072, 2560, 1, 2048, 3072, 3072, 2048, 2560] - - [6, 36272.2] - - - [512, 3072, 1, 2048, 512, 512, 2048, 3072] - - [19, 31784.6] - - - [1024, 3072, 1, 2048, 1024, 1024, 2048, 3072] - - [6, 35267.6] - - - [1536, 3072, 1, 2048, 1536, 1536, 2048, 3072] - - [6, 35707.6] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 3072] - - [21, 35920.7] - - - [2560, 3072, 1, 2048, 2560, 2560, 2048, 3072] - - [21, 36186.1] - - - [3072, 3072, 1, 2048, 3072, 3072, 2048, 3072] - - [6, 36613.0] - - - [512, 512, 1, 3072, 512, 512, 3072, 512] - - [28, 18056.6] - - - [1024, 512, 1, 3072, 1024, 1024, 3072, 512] - - [6, 27553.0] - - - [1536, 512, 1, 3072, 1536, 1536, 3072, 512] - - [19, 26363.9] - - - [2048, 512, 1, 3072, 2048, 2048, 3072, 512] - - [6, 32272.8] - - - [2560, 512, 1, 3072, 2560, 2560, 3072, 512] - - [19, 30756.5] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 512] - - [21, 33875.9] - - - [512, 1024, 1, 3072, 512, 512, 3072, 1024] - - [21, 27610.0] - - - [1024, 1024, 1, 3072, 1024, 1024, 3072, 1024] - - [21, 31679.6] - - - [1536, 1024, 1, 3072, 1536, 1536, 3072, 1024] - - [1, 33794.0] - - - [2048, 1024, 1, 3072, 2048, 2048, 3072, 1024] - - [6, 34582.0] - - - [2560, 1024, 1, 3072, 2560, 2560, 3072, 1024] - - [6, 34984.0] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] - - [6, 36031.0] - - - [512, 1536, 1, 3072, 512, 512, 3072, 1536] - - [6, 23558.9] - - - [1024, 1536, 1, 3072, 1024, 1024, 3072, 1536] - - [1, 33291.3] - - - [1536, 1536, 1, 3072, 1536, 1536, 3072, 1536] - - [8, 33039.3] - - - [2048, 1536, 1, 3072, 2048, 2048, 3072, 1536] - - [21, 35348.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 3072, 1536] - - [1, 35120.0] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 1536] - - [21, 35699.4] - - - [512, 2048, 1, 3072, 512, 512, 3072, 2048] - - [1, 32285.7] - - - [1024, 2048, 1, 3072, 1024, 1024, 3072, 2048] - - [6, 34666.7] - - - [1536, 2048, 1, 3072, 1536, 1536, 3072, 2048] - - [6, 35985.3] - - - [2048, 2048, 1, 3072, 2048, 2048, 3072, 2048] - - [21, 35977.0] - - - [2560, 2048, 1, 3072, 2560, 2560, 3072, 2048] - - [6, 37045.4] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 2048] - - [1, 36987.5] - - - [512, 2560, 1, 3072, 512, 512, 3072, 2560] - - [19, 29945.3] - - - [1024, 2560, 1, 3072, 1024, 1024, 3072, 2560] - - [19, 35250.2] - - - [1536, 2560, 1, 3072, 1536, 1536, 3072, 2560] - - [6, 35889.4] - - - [2048, 2560, 1, 3072, 2048, 2048, 3072, 2560] - - [21, 36970.0] - - - [2560, 2560, 1, 3072, 2560, 2560, 3072, 2560] - - [8, 36851.2] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 2560] - - [21, 36784.3] - - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] - - [6, 33508.8] - - - [1024, 3072, 1, 3072, 1024, 1024, 3072, 3072] - - [19, 36071.5] - - - [1536, 3072, 1, 3072, 1536, 1536, 3072, 3072] - - [6, 36754.1] - - - [2048, 3072, 1, 3072, 2048, 2048, 3072, 3072] - - [6, 37061.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 3072, 3072] - - [6, 36768.6] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 37017.4] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [15, 6.66533e-05] - - - [1, 1, 1, 64, 1, 1, 64, 1] - - [15, 0.00412252] - - - [1, 64, 1, 1, 1, 1, 1, 64] - - [16, 0.00426852] - - - [64, 1, 1, 1, 64, 64, 1, 1] - - [25, 0.00407916] - - - [64, 64, 1, 1, 64, 64, 1, 64] - - [17, 0.266164] - - - [64, 1, 1, 64, 64, 64, 64, 1] - - [24, 0.271564] - - - [1, 64, 1, 64, 1, 1, 64, 64] - - [18, 0.270658] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [11, 17.2707] - - - [64, 64, 1, 256, 64, 64, 256, 64] - - [11, 63.0286] - - - [64, 64, 1, 512, 64, 64, 512, 64] - - [10, 115.644] - - - [64, 64, 1, 1024, 64, 64, 1024, 64] - - [24, 203.124] - - - [64, 64, 1, 2048, 64, 64, 2048, 64] - - [10, 317.817] - - - [64, 64, 1, 4096, 64, 64, 4096, 64] - - [10, 442.484] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index 9d592a746df..00000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,9086 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1200 -- gfx1200 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG64_2_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 9472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 9472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 9472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 9472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 64, 64] - - [26, 1116.36] - - - [1024, 512, 1, 64, 1024, 1024, 64, 64] - - [24, 2109.68] - - - [1536, 512, 1, 64, 1536, 1536, 64, 64] - - [6, 2920.74] - - - [2048, 512, 1, 64, 2048, 2048, 64, 64] - - [24, 3812.35] - - - [2560, 512, 1, 64, 2560, 2560, 64, 64] - - [9, 4280.56] - - - [3072, 512, 1, 64, 3072, 3072, 64, 64] - - [7, 4977.66] - - - [512, 1024, 1, 64, 512, 512, 64, 64] - - [20, 2015.7] - - - [1024, 1024, 1, 64, 1024, 1024, 64, 64] - - [5, 3695.63] - - - [1536, 1024, 1, 64, 1536, 1536, 64, 64] - - [6, 5176.29] - - - [2048, 1024, 1, 64, 2048, 2048, 64, 64] - - [26, 5979.18] - - - [2560, 1024, 1, 64, 2560, 2560, 64, 64] - - [7, 7139.54] - - - [3072, 1024, 1, 64, 3072, 3072, 64, 64] - - [7, 8016.99] - - - [512, 1536, 1, 64, 512, 512, 64, 64] - - [4, 2992.1] - - - [1024, 1536, 1, 64, 1024, 1024, 64, 64] - - [8, 4945.87] - - - [1536, 1536, 1, 64, 1536, 1536, 64, 64] - - [23, 6493.29] - - - [2048, 1536, 1, 64, 2048, 2048, 64, 64] - - [3, 8039.4] - - - [2560, 1536, 1, 64, 2560, 2560, 64, 64] - - [3, 9563.3] - - - [3072, 1536, 1, 64, 3072, 3072, 64, 64] - - [7, 10179.8] - - - [512, 2048, 1, 64, 512, 512, 64, 64] - - [9, 3660.25] - - - [1024, 2048, 1, 64, 1024, 1024, 64, 64] - - [3, 6320.59] - - - [1536, 2048, 1, 64, 1536, 1536, 64, 64] - - [3, 8217.08] - - - [2048, 2048, 1, 64, 2048, 2048, 64, 64] - - [3, 9524.73] - - - [2560, 2048, 1, 64, 2560, 2560, 64, 64] - - [3, 11184.3] - - - [3072, 2048, 1, 64, 3072, 3072, 64, 64] - - [3, 11892.1] - - - [512, 2560, 1, 64, 512, 512, 64, 64] - - [4, 4524.6] - - - [1024, 2560, 1, 64, 1024, 1024, 64, 64] - - [3, 7363.92] - - - [1536, 2560, 1, 64, 1536, 1536, 64, 64] - - [7, 9451.78] - - - [2048, 2560, 1, 64, 2048, 2048, 64, 64] - - [3, 11067.7] - - - [2560, 2560, 1, 64, 2560, 2560, 64, 64] - - [4, 12345.4] - - - [3072, 2560, 1, 64, 3072, 3072, 64, 64] - - [3, 13638.3] - - - [512, 3072, 1, 64, 512, 512, 64, 64] - - [5, 5016.11] - - - [1024, 3072, 1, 64, 1024, 1024, 64, 64] - - [6, 7863.24] - - - [1536, 3072, 1, 64, 1536, 1536, 64, 64] - - [8, 10419.2] - - - [2048, 3072, 1, 64, 2048, 2048, 64, 64] - - [7, 12251.0] - - - [2560, 3072, 1, 64, 2560, 2560, 64, 64] - - [8, 13673.5] - - - [3072, 3072, 1, 64, 3072, 3072, 64, 64] - - [3, 14721.3] - - - [512, 512, 1, 256, 512, 512, 256, 256] - - [23, 3824.3] - - - [1024, 512, 1, 256, 1024, 1024, 256, 256] - - [21, 6310.04] - - - [1536, 512, 1, 256, 1536, 1536, 256, 256] - - [3, 9379.08] - - - [2048, 512, 1, 256, 2048, 2048, 256, 256] - - [7, 11175.5] - - - [2560, 512, 1, 256, 2560, 2560, 256, 256] - - [3, 12621.6] - - - [3072, 512, 1, 256, 3072, 3072, 256, 256] - - [7, 14480.3] - - - [512, 1024, 1, 256, 512, 512, 256, 256] - - [7, 7320.5] - - - [1024, 1024, 1, 256, 1024, 1024, 256, 256] - - [7, 11689.9] - - - [1536, 1024, 1, 256, 1536, 1536, 256, 256] - - [32, 11671.1] - - - [2048, 1024, 1, 256, 2048, 2048, 256, 256] - - [7, 16930.9] - - - [2560, 1024, 1, 256, 2560, 2560, 256, 256] - - [3, 18384.2] - - - [3072, 1024, 1, 256, 3072, 3072, 256, 256] - - [3, 20444.7] - - - [512, 1536, 1, 256, 512, 512, 256, 256] - - [3, 9420.77] - - - [1024, 1536, 1, 256, 1024, 1024, 256, 256] - - [7, 14561.4] - - - [1536, 1536, 1, 256, 1536, 1536, 256, 256] - - [7, 17782.7] - - - [2048, 1536, 1, 256, 2048, 2048, 256, 256] - - [7, 20398.1] - - - [2560, 1536, 1, 256, 2560, 2560, 256, 256] - - [7, 22130.6] - - - [3072, 1536, 1, 256, 3072, 3072, 256, 256] - - [7, 23282.6] - - - [512, 2048, 1, 256, 512, 512, 256, 256] - - [23, 11511.4] - - - [1024, 2048, 1, 256, 1024, 1024, 256, 256] - - [7, 17019.7] - - - [1536, 2048, 1, 256, 1536, 1536, 256, 256] - - [7, 20531.0] - - - [2048, 2048, 1, 256, 2048, 2048, 256, 256] - - [7, 22318.2] - - - [2560, 2048, 1, 256, 2560, 2560, 256, 256] - - [3, 24464.2] - - - [3072, 2048, 1, 256, 3072, 3072, 256, 256] - - [7, 26127.0] - - - [512, 2560, 1, 256, 512, 512, 256, 256] - - [23, 12787.8] - - - [1024, 2560, 1, 256, 1024, 1024, 256, 256] - - [23, 17516.4] - - - [1536, 2560, 1, 256, 1536, 1536, 256, 256] - - [7, 22099.0] - - - [2048, 2560, 1, 256, 2048, 2048, 256, 256] - - [3, 24493.2] - - - [2560, 2560, 1, 256, 2560, 2560, 256, 256] - - [23, 26039.0] - - - [3072, 2560, 1, 256, 3072, 3072, 256, 256] - - [7, 27133.3] - - - [512, 3072, 1, 256, 512, 512, 256, 256] - - [31, 11673.0] - - - [1024, 3072, 1, 256, 1024, 1024, 256, 256] - - [7, 20336.3] - - - [1536, 3072, 1, 256, 1536, 1536, 256, 256] - - [7, 23677.8] - - - [2048, 3072, 1, 256, 2048, 2048, 256, 256] - - [23, 25872.3] - - - [2560, 3072, 1, 256, 2560, 2560, 256, 256] - - [3, 27085.7] - - - [3072, 3072, 1, 256, 3072, 3072, 256, 256] - - [3, 28148.9] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [29, 6901.9] - - - [1024, 512, 1, 512, 1024, 1024, 512, 512] - - [22, 10023.5] - - - [1536, 512, 1, 512, 1536, 1536, 512, 512] - - [3, 14241.6] - - - [2048, 512, 1, 512, 2048, 2048, 512, 512] - - [7, 17210.7] - - - [2560, 512, 1, 512, 2560, 2560, 512, 512] - - [7, 18333.8] - - - [3072, 512, 1, 512, 3072, 3072, 512, 512] - - [18, 20924.1] - - - [512, 1024, 1, 512, 512, 512, 512, 512] - - [18, 11163.9] - - - [1024, 1024, 1, 512, 1024, 1024, 512, 512] - - [3, 17257.7] - - - [1536, 1024, 1, 512, 1536, 1536, 512, 512] - - [3, 21191.4] - - - [2048, 1024, 1, 512, 2048, 2048, 512, 512] - - [3, 23518.1] - - - [2560, 1024, 1, 512, 2560, 2560, 512, 512] - - [7, 24712.4] - - - [3072, 1024, 1, 512, 3072, 3072, 512, 512] - - [3, 26455.7] - - - [512, 1536, 1, 512, 512, 512, 512, 512] - - [7, 14173.7] - - - [1024, 1536, 1, 512, 1024, 1024, 512, 512] - - [3, 21272.9] - - - [1536, 1536, 1, 512, 1536, 1536, 512, 512] - - [7, 23413.7] - - - [2048, 1536, 1, 512, 2048, 2048, 512, 512] - - [7, 26629.0] - - - [2560, 1536, 1, 512, 2560, 2560, 512, 512] - - [3, 27606.6] - - - [3072, 1536, 1, 512, 3072, 3072, 512, 512] - - [3, 29315.8] - - - [512, 2048, 1, 512, 512, 512, 512, 512] - - [3, 17921.7] - - - [1024, 2048, 1, 512, 1024, 1024, 512, 512] - - [7, 23634.6] - - - [1536, 2048, 1, 512, 1536, 1536, 512, 512] - - [3, 26315.1] - - - [2048, 2048, 1, 512, 2048, 2048, 512, 512] - - [3, 28469.1] - - - [2560, 2048, 1, 512, 2560, 2560, 512, 512] - - [18, 30072.4] - - - [3072, 2048, 1, 512, 3072, 3072, 512, 512] - - [23, 31016.6] - - - [512, 2560, 1, 512, 512, 512, 512, 512] - - [3, 18699.8] - - - [1024, 2560, 1, 512, 1024, 1024, 512, 512] - - [3, 24838.3] - - - [1536, 2560, 1, 512, 1536, 1536, 512, 512] - - [3, 27568.4] - - - [2048, 2560, 1, 512, 2048, 2048, 512, 512] - - [3, 30484.8] - - - [2560, 2560, 1, 512, 2560, 2560, 512, 512] - - [7, 31541.9] - - - [3072, 2560, 1, 512, 3072, 3072, 512, 512] - - [23, 31976.4] - - - [512, 3072, 1, 512, 512, 512, 512, 512] - - [25, 20206.2] - - - [1024, 3072, 1, 512, 1024, 1024, 512, 512] - - [3, 26139.5] - - - [1536, 3072, 1, 512, 1536, 1536, 512, 512] - - [7, 29248.4] - - - [2048, 3072, 1, 512, 2048, 2048, 512, 512] - - [3, 30635.2] - - - [2560, 3072, 1, 512, 2560, 2560, 512, 512] - - [7, 31928.0] - - - [3072, 3072, 1, 512, 3072, 3072, 512, 512] - - [3, 32401.2] - - - [512, 512, 1, 1024, 512, 512, 1024, 1024] - - [16, 11191.8] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] - - [30, 15094.6] - - - [1536, 512, 1, 1024, 1536, 1536, 1024, 1024] - - [7, 20115.1] - - - [2048, 512, 1, 1024, 2048, 2048, 1024, 1024] - - [1, 18401.7] - - - [2560, 512, 1, 1024, 2560, 2560, 1024, 1024] - - [3, 24437.0] - - - [3072, 512, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 27036.4] - - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] - - [23, 18759.2] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 23930.3] - - - [1536, 1024, 1, 1024, 1536, 1536, 1024, 1024] - - [3, 27311.9] - - - [2048, 1024, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 29234.0] - - - [2560, 1024, 1, 1024, 2560, 2560, 1024, 1024] - - [18, 30887.0] - - - [3072, 1024, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 31271.5] - - - [512, 1536, 1, 1024, 512, 512, 1024, 1024] - - [3, 15196.0] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 27468.2] - - - [1536, 1536, 1, 1024, 1536, 1536, 1024, 1024] - - [9, 28538.7] - - - [2048, 1536, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 31260.8] - - - [2560, 1536, 1, 1024, 2560, 2560, 1024, 1024] - - [18, 31617.8] - - - [3072, 1536, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 33265.9] - - - [512, 2048, 1, 1024, 512, 512, 1024, 1024] - - [7, 24437.9] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] - - [23, 29042.0] - - - [1536, 2048, 1, 1024, 1536, 1536, 1024, 1024] - - [3, 31309.3] - - - [2048, 2048, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 32573.8] - - - [2560, 2048, 1, 1024, 2560, 2560, 1024, 1024] - - [7, 33352.6] - - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 1024] - - [18, 33978.9] - - - [512, 2560, 1, 1024, 512, 512, 1024, 1024] - - [3, 24450.6] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 30538.4] - - - [1536, 2560, 1, 1024, 1536, 1536, 1024, 1024] - - [18, 31676.3] - - - [2048, 2560, 1, 1024, 2048, 2048, 1024, 1024] - - [18, 33474.4] - - - [2560, 2560, 1, 1024, 2560, 2560, 1024, 1024] - - [25, 34230.7] - - - [3072, 2560, 1, 1024, 3072, 3072, 1024, 1024] - - [18, 34591.6] - - - [512, 3072, 1, 1024, 512, 512, 1024, 1024] - - [7, 27123.1] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] - - [25, 30137.2] - - - [1536, 3072, 1, 1024, 1536, 1536, 1024, 1024] - - [18, 33263.7] - - - [2048, 3072, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 33801.1] - - - [2560, 3072, 1, 1024, 2560, 2560, 1024, 1024] - - [3, 35197.0] - - - [3072, 3072, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 35739.8] - - - [512, 512, 1, 2048, 512, 512, 2048, 2048] - - [0, 16070.1] - - - [1024, 512, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 24839.9] - - - [1536, 512, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 24644.6] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 29724.8] - - - [2560, 512, 1, 2048, 2560, 2560, 2048, 2048] - - [7, 28576.3] - - - [3072, 512, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 31765.6] - - - [512, 1024, 1, 2048, 512, 512, 2048, 2048] - - [30, 19501.7] - - - [1024, 1024, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 29849.0] - - - [1536, 1024, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 31696.9] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] - - [7, 33441.7] - - - [2560, 1024, 1, 2048, 2560, 2560, 2048, 2048] - - [18, 33752.7] - - - [3072, 1024, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 34739.8] - - - [512, 1536, 1, 2048, 512, 512, 2048, 2048] - - [7, 25223.0] - - - [1024, 1536, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 32174.0] - - - [1536, 1536, 1, 2048, 1536, 1536, 2048, 2048] - - [23, 31646.4] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 2048] - - [3, 34694.8] - - - [2560, 1536, 1, 2048, 2560, 2560, 2048, 2048] - - [3, 34553.2] - - - [3072, 1536, 1, 2048, 3072, 3072, 2048, 2048] - - [23, 35759.2] - - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] - - [3, 29652.8] - - - [1024, 2048, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 33297.5] - - - [1536, 2048, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 34608.8] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 35139.7] - - - [2560, 2048, 1, 2048, 2560, 2560, 2048, 2048] - - [23, 35341.3] - - - [3072, 2048, 1, 2048, 3072, 3072, 2048, 2048] - - [23, 35956.2] - - - [512, 2560, 1, 2048, 512, 512, 2048, 2048] - - [3, 28258.1] - - - [1024, 2560, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 33681.7] - - - [1536, 2560, 1, 2048, 1536, 1536, 2048, 2048] - - [3, 34214.0] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2048] - - [3, 35566.9] - - - [2560, 2560, 1, 2048, 2560, 2560, 2048, 2048] - - [3, 34975.4] - - - [3072, 2560, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 36457.3] - - - [512, 3072, 1, 2048, 512, 512, 2048, 2048] - - [23, 32028.3] - - - [1024, 3072, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 35102.8] - - - [1536, 3072, 1, 2048, 1536, 1536, 2048, 2048] - - [23, 35622.4] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 36506.3] - - - [2560, 3072, 1, 2048, 2560, 2560, 2048, 2048] - - [23, 36578.4] - - - [3072, 3072, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 36227.1] - - - [512, 512, 1, 3072, 512, 512, 3072, 3072] - - [16, 19097.1] - - - [1024, 512, 1, 3072, 1024, 1024, 3072, 3072] - - [7, 22485.2] - - - [1536, 512, 1, 3072, 1536, 1536, 3072, 3072] - - [18, 27063.8] - - - [2048, 512, 1, 3072, 2048, 2048, 3072, 3072] - - [23, 31945.5] - - - [2560, 512, 1, 3072, 2560, 2560, 3072, 3072] - - [25, 30201.7] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 33670.1] - - - [512, 1024, 1, 3072, 512, 512, 3072, 3072] - - [23, 27764.6] - - - [1024, 1024, 1, 3072, 1024, 1024, 3072, 3072] - - [18, 31934.4] - - - [1536, 1024, 1, 3072, 1536, 1536, 3072, 3072] - - [18, 33879.6] - - - [2048, 1024, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 34652.8] - - - [2560, 1024, 1, 3072, 2560, 2560, 3072, 3072] - - [7, 35678.7] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36476.6] - - - [512, 1536, 1, 3072, 512, 512, 3072, 3072] - - [19, 26411.6] - - - [1024, 1536, 1, 3072, 1024, 1024, 3072, 3072] - - [18, 33941.6] - - - [1536, 1536, 1, 3072, 1536, 1536, 3072, 3072] - - [9, 33518.4] - - - [2048, 1536, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 35550.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 3072, 3072] - - [19, 35037.3] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36761.4] - - - [512, 2048, 1, 3072, 512, 512, 3072, 3072] - - [23, 32025.2] - - - [1024, 2048, 1, 3072, 1024, 1024, 3072, 3072] - - [23, 34942.8] - - - [1536, 2048, 1, 3072, 1536, 1536, 3072, 3072] - - [7, 36511.0] - - - [2048, 2048, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 36512.1] - - - [2560, 2048, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36582.2] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 3072] - - [3, 36507.9] - - - [512, 2560, 1, 3072, 512, 512, 3072, 3072] - - [9, 30497.2] - - - [1024, 2560, 1, 3072, 1024, 1024, 3072, 3072] - - [23, 35525.0] - - - [1536, 2560, 1, 3072, 1536, 1536, 3072, 3072] - - [9, 35295.7] - - - [2048, 2560, 1, 3072, 2048, 2048, 3072, 3072] - - [23, 36415.3] - - - [2560, 2560, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36079.3] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36791.5] - - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] - - [23, 33847.3] - - - [1024, 3072, 1, 3072, 1024, 1024, 3072, 3072] - - [3, 35738.6] - - - [1536, 3072, 1, 3072, 1536, 1536, 3072, 3072] - - [23, 36281.0] - - - [2048, 3072, 1, 3072, 2048, 2048, 3072, 3072] - - [3, 36700.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36778.3] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [3, 37457.0] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [17, 6.65624e-05] - - - [1, 1, 1, 64, 1, 1, 64, 64] - - [21, 0.00424868] - - - [1, 64, 1, 1, 1, 1, 1, 1] - - [13, 0.00424305] - - - [64, 1, 1, 1, 64, 64, 1, 1] - - [15, 0.00427565] - - - [64, 64, 1, 1, 64, 64, 1, 1] - - [2, 0.269944] - - - [64, 1, 1, 64, 64, 64, 64, 64] - - [10, 0.250505] - - - [1, 64, 1, 64, 1, 1, 64, 64] - - [28, 0.271195] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [12, 17.4611] - - - [64, 64, 1, 256, 64, 64, 256, 256] - - [11, 64.9052] - - - [64, 64, 1, 512, 64, 64, 512, 512] - - [14, 118.493] - - - [64, 64, 1, 1024, 64, 64, 1024, 1024] - - [11, 203.869] - - - [64, 64, 1, 2048, 64, 64, 2048, 2048] - - [11, 322.713] - - - [64, 64, 1, 4096, 64, 64, 4096, 4096] - - [27, 447.864] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 00000000000..64178fba7f5 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,389 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1201 +- gfx1201 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAnKrZLw0Nlt1oQtI32D3BKNxykE_z_QRF1ZHflDCgI_Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12800 + LdsInitCVgprs: false + LdsNumBytes: 12800 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 10496 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 0.08] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 00000000000..eba2ec731d0 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,389 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1201 +- gfx1201 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAvPbW1EldmDkYG10epNpkVm-irjGzqxm08jz6SByNeQs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6400 + LdsInitCVgprs: false + LdsNumBytes: 6400 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 0.08] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 00000000000..8e40467e7f4 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,389 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1201 +- gfx1201 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAoP3i3lF397797qrIx8MG_jBevK8ik06WqHbC1LQ2U58= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 0.08] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 00000000000..213cebe1154 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,389 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1201 +- gfx1201 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAaABph6osUKON-kGZ0d9XYIH3_vXdZUYgjn6Qfg8R2qw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13312 + LdsInitCVgprs: false + LdsNumBytes: 13312 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [0, 0.08] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index cbf5e9b7606..00000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,7014 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1201 -- gfx1201 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB256_MIWT1_1_PGR1_PLR0_SS0_SVW8_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG64_2_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR1_SS1_SVW1_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR0_SS1_SVW1_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 512, 512] - - [18, 1099.89] - - - [1024, 512, 1, 64, 1024, 1024, 1024, 512] - - [6, 2144.12] - - - [1536, 512, 1, 64, 1536, 1536, 1536, 512] - - [7, 2764.79] - - - [2048, 512, 1, 64, 2048, 2048, 2048, 512] - - [19, 3519.82] - - - [2560, 512, 1, 64, 2560, 2560, 2560, 512] - - [8, 4377.96] - - - [3072, 512, 1, 64, 3072, 3072, 3072, 512] - - [6, 5057.82] - - - [512, 1024, 1, 64, 512, 512, 512, 1024] - - [7, 2087.37] - - - [1024, 1024, 1, 64, 1024, 1024, 1024, 1024] - - [6, 3734.7] - - - [1536, 1024, 1, 64, 1536, 1536, 1536, 1024] - - [5, 4845.64] - - - [2048, 1024, 1, 64, 2048, 2048, 2048, 1024] - - [8, 6096.37] - - - [2560, 1024, 1, 64, 2560, 2560, 2560, 1024] - - [5, 7148.67] - - - [3072, 1024, 1, 64, 3072, 3072, 3072, 1024] - - [5, 7922.19] - - - [512, 1536, 1, 64, 512, 512, 512, 1536] - - [5, 2895.54] - - - [1024, 1536, 1, 64, 1024, 1024, 1024, 1536] - - [24, 4027.82] - - - [1536, 1536, 1, 64, 1536, 1536, 1536, 1536] - - [5, 6702.84] - - - [2048, 1536, 1, 64, 2048, 2048, 2048, 1536] - - [2, 7639.17] - - - [2560, 1536, 1, 64, 2560, 2560, 2560, 1536] - - [8, 9019.2] - - - [3072, 1536, 1, 64, 3072, 3072, 3072, 1536] - - [6, 9848.67] - - - [512, 2048, 1, 64, 512, 512, 512, 2048] - - [5, 3739.91] - - - [1024, 2048, 1, 64, 1024, 1024, 1024, 2048] - - [8, 6187.86] - - - [1536, 2048, 1, 64, 1536, 1536, 1536, 2048] - - [6, 7769.18] - - - [2048, 2048, 1, 64, 2048, 2048, 2048, 2048] - - [5, 9177.28] - - - [2560, 2048, 1, 64, 2560, 2560, 2560, 2048] - - [3, 10608.8] - - - [3072, 2048, 1, 64, 3072, 3072, 3072, 2048] - - [5, 12018.4] - - - [512, 2560, 1, 64, 512, 512, 512, 2560] - - [6, 4229.73] - - - [1024, 2560, 1, 64, 1024, 1024, 1024, 2560] - - [5, 7224.09] - - - [1536, 2560, 1, 64, 1536, 1536, 1536, 2560] - - [6, 8998.24] - - - [2048, 2560, 1, 64, 2048, 2048, 2048, 2560] - - [2, 10791.5] - - - [2560, 2560, 1, 64, 2560, 2560, 2560, 2560] - - [6, 12205.3] - - - [3072, 2560, 1, 64, 3072, 3072, 3072, 2560] - - [4, 12526.4] - - - [512, 3072, 1, 64, 512, 512, 512, 3072] - - [22, 3995.76] - - - [1024, 3072, 1, 64, 1024, 1024, 1024, 3072] - - [8, 7679.97] - - - [1536, 3072, 1, 64, 1536, 1536, 1536, 3072] - - [2, 10270.0] - - - [2048, 3072, 1, 64, 2048, 2048, 2048, 3072] - - [5, 11815.3] - - - [2560, 3072, 1, 64, 2560, 2560, 2560, 3072] - - [6, 12965.9] - - - [3072, 3072, 1, 64, 3072, 3072, 3072, 3072] - - [8, 13879.2] - - - [512, 512, 1, 256, 512, 512, 512, 512] - - [10, 3903.49] - - - [1024, 512, 1, 256, 1024, 1024, 1024, 512] - - [7, 6716.76] - - - [1536, 512, 1, 256, 1536, 1536, 1536, 512] - - [7, 8778.71] - - - [2048, 512, 1, 256, 2048, 2048, 2048, 512] - - [5, 10953.9] - - - [2560, 512, 1, 256, 2560, 2560, 2560, 512] - - [19, 12527.3] - - - [3072, 512, 1, 256, 3072, 3072, 3072, 512] - - [1, 9610.21] - - - [512, 1024, 1, 256, 512, 512, 512, 1024] - - [7, 6923.26] - - - [1024, 1024, 1, 256, 1024, 1024, 1024, 1024] - - [5, 11248.1] - - - [1536, 1024, 1, 256, 1536, 1536, 1536, 1024] - - [5, 14206.2] - - - [2048, 1024, 1, 256, 2048, 2048, 2048, 1024] - - [5, 16510.7] - - - [2560, 1024, 1, 256, 2560, 2560, 2560, 1024] - - [5, 18361.3] - - - [3072, 1024, 1, 256, 3072, 3072, 3072, 1024] - - [16, 19686.5] - - - [512, 1536, 1, 256, 512, 512, 512, 1536] - - [5, 8780.82] - - - [1024, 1536, 1, 256, 1024, 1024, 1024, 1536] - - [5, 14508.7] - - - [1536, 1536, 1, 256, 1536, 1536, 1536, 1536] - - [16, 16967.2] - - - [2048, 1536, 1, 256, 2048, 2048, 2048, 1536] - - [5, 19927.9] - - - [2560, 1536, 1, 256, 2560, 2560, 2560, 1536] - - [5, 21248.4] - - - [3072, 1536, 1, 256, 3072, 3072, 3072, 1536] - - [5, 22807.0] - - - [512, 2048, 1, 256, 512, 512, 512, 2048] - - [17, 7492.55] - - - [1024, 2048, 1, 256, 1024, 1024, 1024, 2048] - - [5, 16369.8] - - - [1536, 2048, 1, 256, 1536, 1536, 1536, 2048] - - [1, 19975.1] - - - [2048, 2048, 1, 256, 2048, 2048, 2048, 2048] - - [16, 21887.6] - - - [2560, 2048, 1, 256, 2560, 2560, 2560, 2048] - - [5, 23864.7] - - - [3072, 2048, 1, 256, 3072, 3072, 3072, 2048] - - [5, 25187.5] - - - [512, 2560, 1, 256, 512, 512, 512, 2560] - - [5, 12600.2] - - - [1024, 2560, 1, 256, 1024, 1024, 1024, 2560] - - [19, 18341.3] - - - [1536, 2560, 1, 256, 1536, 1536, 1536, 2560] - - [5, 21642.2] - - - [2048, 2560, 1, 256, 2048, 2048, 2048, 2560] - - [5, 23746.5] - - - [2560, 2560, 1, 256, 2560, 2560, 2560, 2560] - - [5, 25374.1] - - - [3072, 2560, 1, 256, 3072, 3072, 3072, 2560] - - [5, 26900.8] - - - [512, 3072, 1, 256, 512, 512, 512, 3072] - - [16, 14040.0] - - - [1024, 3072, 1, 256, 1024, 1024, 1024, 3072] - - [5, 20017.3] - - - [1536, 3072, 1, 256, 1536, 1536, 1536, 3072] - - [5, 22886.9] - - - [2048, 3072, 1, 256, 2048, 2048, 2048, 3072] - - [5, 25132.2] - - - [2560, 3072, 1, 256, 2560, 2560, 2560, 3072] - - [1, 26647.4] - - - [3072, 3072, 1, 256, 3072, 3072, 3072, 3072] - - [19, 27327.5] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [23, 6688.31] - - - [1024, 512, 1, 512, 1024, 1024, 1024, 512] - - [16, 11672.1] - - - [1536, 512, 1, 512, 1536, 1536, 1536, 512] - - [19, 14221.5] - - - [2048, 512, 1, 512, 2048, 2048, 2048, 512] - - [16, 17068.4] - - - [2560, 512, 1, 512, 2560, 2560, 2560, 512] - - [19, 18650.2] - - - [3072, 512, 1, 512, 3072, 3072, 3072, 512] - - [5, 20663.2] - - - [512, 1024, 1, 512, 512, 512, 512, 1024] - - [19, 7379.97] - - - [1024, 1024, 1, 512, 1024, 1024, 1024, 1024] - - [5, 16821.9] - - - [1536, 1024, 1, 512, 1536, 1536, 1536, 1024] - - [5, 20655.0] - - - [2048, 1024, 1, 512, 2048, 2048, 2048, 1024] - - [19, 22773.6] - - - [2560, 1024, 1, 512, 2560, 2560, 2560, 1024] - - [16, 24229.7] - - - [3072, 1024, 1, 512, 3072, 3072, 3072, 1024] - - [19, 25706.9] - - - [512, 1536, 1, 512, 512, 512, 512, 1536] - - [5, 14320.4] - - - [1024, 1536, 1, 512, 1024, 1024, 1024, 1536] - - [5, 20400.7] - - - [1536, 1536, 1, 512, 1536, 1536, 1536, 1536] - - [5, 23523.4] - - - [2048, 1536, 1, 512, 2048, 2048, 2048, 1536] - - [16, 26308.4] - - - [2560, 1536, 1, 512, 2560, 2560, 2560, 1536] - - [19, 27021.9] - - - [3072, 1536, 1, 512, 3072, 3072, 3072, 1536] - - [5, 29369.5] - - - [512, 2048, 1, 512, 512, 512, 512, 2048] - - [5, 16680.5] - - - [1024, 2048, 1, 512, 1024, 1024, 1024, 2048] - - [19, 22463.2] - - - [1536, 2048, 1, 512, 1536, 1536, 1536, 2048] - - [16, 25764.7] - - - [2048, 2048, 1, 512, 2048, 2048, 2048, 2048] - - [5, 27882.2] - - - [2560, 2048, 1, 512, 2560, 2560, 2560, 2048] - - [5, 29294.8] - - - [3072, 2048, 1, 512, 3072, 3072, 3072, 2048] - - [16, 30521.8] - - - [512, 2560, 1, 512, 512, 512, 512, 2560] - - [19, 18251.5] - - - [1024, 2560, 1, 512, 1024, 1024, 1024, 2560] - - [16, 19232.6] - - - [1536, 2560, 1, 512, 1536, 1536, 1536, 2560] - - [16, 27498.9] - - - [2048, 2560, 1, 512, 2048, 2048, 2048, 2560] - - [19, 29866.3] - - - [2560, 2560, 1, 512, 2560, 2560, 2560, 2560] - - [19, 30820.6] - - - [3072, 2560, 1, 512, 3072, 3072, 3072, 2560] - - [5, 32075.9] - - - [512, 3072, 1, 512, 512, 512, 512, 3072] - - [22, 15962.5] - - - [1024, 3072, 1, 512, 1024, 1024, 1024, 3072] - - [16, 25657.7] - - - [1536, 3072, 1, 512, 1536, 1536, 1536, 3072] - - [16, 28584.3] - - - [2048, 3072, 1, 512, 2048, 2048, 2048, 3072] - - [5, 30594.4] - - - [2560, 3072, 1, 512, 2560, 2560, 2560, 3072] - - [1, 30950.8] - - - [3072, 3072, 1, 512, 3072, 3072, 3072, 3072] - - [16, 32420.7] - - - [512, 512, 1, 1024, 512, 512, 512, 512] - - [10, 10387.0] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 512] - - [19, 17022.2] - - - [1536, 512, 1, 1024, 1536, 1536, 1536, 512] - - [5, 20059.7] - - - [2048, 512, 1, 1024, 2048, 2048, 2048, 512] - - [19, 22638.7] - - - [2560, 512, 1, 1024, 2560, 2560, 2560, 512] - - [20, 23365.2] - - - [3072, 512, 1, 1024, 3072, 3072, 3072, 512] - - [5, 26126.8] - - - [512, 1024, 1, 1024, 512, 512, 512, 1024] - - [0, 13164.4] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [19, 23349.1] - - - [1536, 1024, 1, 1024, 1536, 1536, 1536, 1024] - - [16, 26046.3] - - - [2048, 1024, 1, 1024, 2048, 2048, 2048, 1024] - - [16, 28018.9] - - - [2560, 1024, 1, 1024, 2560, 2560, 2560, 1024] - - [5, 29343.0] - - - [3072, 1024, 1, 1024, 3072, 3072, 3072, 1024] - - [19, 30870.5] - - - [512, 1536, 1, 1024, 512, 512, 512, 1536] - - [19, 19795.4] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1536] - - [19, 25787.3] - - - [1536, 1536, 1, 1024, 1536, 1536, 1536, 1536] - - [16, 24877.5] - - - [2048, 1536, 1, 1024, 2048, 2048, 2048, 1536] - - [1, 30148.3] - - - [2560, 1536, 1, 1024, 2560, 2560, 2560, 1536] - - [19, 32041.5] - - - [3072, 1536, 1, 1024, 3072, 3072, 3072, 1536] - - [5, 32160.4] - - - [512, 2048, 1, 1024, 512, 512, 512, 2048] - - [19, 23000.9] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 2048] - - [19, 27958.6] - - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 2048] - - [20, 30353.3] - - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 2048] - - [16, 32910.4] - - - [2560, 2048, 1, 1024, 2560, 2560, 2560, 2048] - - [1, 33207.0] - - - [3072, 2048, 1, 1024, 3072, 3072, 3072, 2048] - - [1, 34410.8] - - - [512, 2560, 1, 1024, 512, 512, 512, 2560] - - [19, 19293.7] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 2560] - - [20, 28586.9] - - - [1536, 2560, 1, 1024, 1536, 1536, 1536, 2560] - - [5, 31516.4] - - - [2048, 2560, 1, 1024, 2048, 2048, 2048, 2560] - - [5, 32695.5] - - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 2560] - - [16, 33584.5] - - - [3072, 2560, 1, 1024, 3072, 3072, 3072, 2560] - - [5, 34444.3] - - - [512, 3072, 1, 1024, 512, 512, 512, 3072] - - [5, 26250.5] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] - - [16, 30288.9] - - - [1536, 3072, 1, 1024, 1536, 1536, 1536, 3072] - - [19, 32653.5] - - - [2048, 3072, 1, 1024, 2048, 2048, 2048, 3072] - - [19, 34049.1] - - - [2560, 3072, 1, 1024, 2560, 2560, 2560, 3072] - - [16, 34721.0] - - - [3072, 3072, 1, 1024, 3072, 3072, 3072, 3072] - - [5, 34981.8] - - - [512, 512, 1, 2048, 512, 512, 512, 512] - - [11, 15358.5] - - - [1024, 512, 1, 2048, 1024, 1024, 1024, 512] - - [1, 23129.8] - - - [1536, 512, 1, 2048, 1536, 1536, 1536, 512] - - [19, 24494.7] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 512] - - [5, 27958.6] - - - [2560, 512, 1, 2048, 2560, 2560, 2560, 512] - - [16, 28495.7] - - - [3072, 512, 1, 2048, 3072, 3072, 3072, 512] - - [19, 29965.7] - - - [512, 1024, 1, 2048, 512, 512, 512, 1024] - - [19, 22924.3] - - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 1024] - - [16, 27329.3] - - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 1024] - - [19, 30559.4] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 1024] - - [19, 31352.9] - - - [2560, 1024, 1, 2048, 2560, 2560, 2560, 1024] - - [16, 32903.2] - - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 1024] - - [19, 34230.3] - - - [512, 1536, 1, 2048, 512, 512, 512, 1536] - - [19, 24394.2] - - - [1024, 1536, 1, 2048, 1024, 1024, 1024, 1536] - - [5, 29957.4] - - - [1536, 1536, 1, 2048, 1536, 1536, 1536, 1536] - - [5, 32132.4] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 1536] - - [5, 32967.6] - - - [2560, 1536, 1, 2048, 2560, 2560, 2560, 1536] - - [19, 34108.8] - - - [3072, 1536, 1, 2048, 3072, 3072, 3072, 1536] - - [5, 35303.1] - - - [512, 2048, 1, 2048, 512, 512, 512, 2048] - - [19, 27543.7] - - - [1024, 2048, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 31852.7] - - - [1536, 2048, 1, 2048, 1536, 1536, 1536, 2048] - - [5, 33429.5] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [5, 35261.2] - - - [2560, 2048, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 35012.8] - - - [3072, 2048, 1, 2048, 3072, 3072, 3072, 2048] - - [5, 34927.5] - - - [512, 2560, 1, 2048, 512, 512, 512, 2560] - - [19, 28473.1] - - - [1024, 2560, 1, 2048, 1024, 1024, 1024, 2560] - - [16, 32443.1] - - - [1536, 2560, 1, 2048, 1536, 1536, 1536, 2560] - - [19, 33703.4] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2560] - - [19, 34952.8] - - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2560] - - [19, 35427.5] - - - [3072, 2560, 1, 2048, 3072, 3072, 3072, 2560] - - [19, 35568.4] - - - [512, 3072, 1, 2048, 512, 512, 512, 3072] - - [1, 30643.9] - - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 3072] - - [5, 33135.0] - - - [1536, 3072, 1, 2048, 1536, 1536, 1536, 3072] - - [19, 35086.0] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 3072] - - [16, 35286.5] - - - [2560, 3072, 1, 2048, 2560, 2560, 2560, 3072] - - [5, 35540.8] - - - [3072, 3072, 1, 2048, 3072, 3072, 3072, 3072] - - [19, 35994.4] - - - [512, 512, 1, 3072, 512, 512, 512, 512] - - [10, 17907.8] - - - [1024, 512, 1, 3072, 1024, 1024, 1024, 512] - - [16, 25523.4] - - - [1536, 512, 1, 3072, 1536, 1536, 1536, 512] - - [19, 26395.7] - - - [2048, 512, 1, 3072, 2048, 2048, 2048, 512] - - [16, 29236.7] - - - [2560, 512, 1, 3072, 2560, 2560, 2560, 512] - - [5, 30260.9] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 512] - - [19, 31219.3] - - - [512, 1024, 1, 3072, 512, 512, 512, 1024] - - [3, 24676.9] - - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 1024] - - [19, 29555.2] - - - [1536, 1024, 1, 3072, 1536, 1536, 1536, 1024] - - [5, 31553.2] - - - [2048, 1024, 1, 3072, 2048, 2048, 2048, 1024] - - [5, 32571.0] - - - [2560, 1024, 1, 3072, 2560, 2560, 2560, 1024] - - [16, 33635.7] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] - - [1, 34003.7] - - - [512, 1536, 1, 3072, 512, 512, 512, 1536] - - [5, 26965.3] - - - [1024, 1536, 1, 3072, 1024, 1024, 1024, 1536] - - [16, 31697.3] - - - [1536, 1536, 1, 3072, 1536, 1536, 1536, 1536] - - [19, 33471.2] - - - [2048, 1536, 1, 3072, 2048, 2048, 2048, 1536] - - [19, 34142.1] - - - [2560, 1536, 1, 3072, 2560, 2560, 2560, 1536] - - [5, 34520.7] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 1536] - - [1, 35005.5] - - - [512, 2048, 1, 3072, 512, 512, 512, 2048] - - [1, 30031.5] - - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 2048] - - [19, 33287.7] - - - [1536, 2048, 1, 3072, 1536, 1536, 1536, 2048] - - [16, 34538.9] - - - [2048, 2048, 1, 3072, 2048, 2048, 2048, 2048] - - [5, 34890.2] - - - [2560, 2048, 1, 3072, 2560, 2560, 2560, 2048] - - [19, 35540.9] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 2048] - - [19, 35493.7] - - - [512, 2560, 1, 3072, 512, 512, 512, 2560] - - [19, 30412.2] - - - [1024, 2560, 1, 3072, 1024, 1024, 1024, 2560] - - [16, 34182.2] - - - [1536, 2560, 1, 3072, 1536, 1536, 1536, 2560] - - [5, 34612.4] - - - [2048, 2560, 1, 3072, 2048, 2048, 2048, 2560] - - [5, 34945.9] - - - [2560, 2560, 1, 3072, 2560, 2560, 2560, 2560] - - [19, 35803.3] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 2560] - - [1, 35815.1] - - - [512, 3072, 1, 3072, 512, 512, 512, 3072] - - [1, 32031.1] - - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] - - [19, 33794.0] - - - [1536, 3072, 1, 3072, 1536, 1536, 1536, 3072] - - [1, 34947.1] - - - [2048, 3072, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 35799.6] - - - [2560, 3072, 1, 3072, 2560, 2560, 2560, 3072] - - [1, 35674.7] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [19, 36047.4] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [13, 6.37979e-05] - - - [1, 1, 1, 64, 1, 1, 1, 1] - - [15, 0.004] - - - [1, 64, 1, 1, 1, 1, 1, 64] - - [13, 0.00423883] - - - [64, 1, 1, 1, 64, 64, 64, 1] - - [13, 0.00425858] - - - [64, 64, 1, 1, 64, 64, 64, 64] - - [9, 0.269323] - - - [64, 1, 1, 64, 64, 64, 64, 1] - - [12, 0.267914] - - - [1, 64, 1, 64, 1, 1, 1, 64] - - [15, 0.266945] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [14, 17.0789] - - - [64, 64, 1, 256, 64, 64, 64, 64] - - [14, 62.8228] - - - [64, 64, 1, 512, 64, 64, 64, 64] - - [21, 117.235] - - - [64, 64, 1, 1024, 64, 64, 64, 64] - - [9, 196.083] - - - [64, 64, 1, 2048, 64, 64, 64, 64] - - [21, 307.602] - - - [64, 64, 1, 4096, 64, 64, 64, 64] - - [9, 442.484] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index dfa46992dc0..00000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,7273 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1201 -- gfx1201 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SVW8_TLDS0_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS0_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 512, 64] - - [16, 1102.78] - - - [1024, 512, 1, 64, 1024, 1024, 1024, 64] - - [6, 2110.34] - - - [1536, 512, 1, 64, 1536, 1536, 1536, 64] - - [1, 2716.22] - - - [2048, 512, 1, 64, 2048, 2048, 2048, 64] - - [2, 3824.3] - - - [2560, 512, 1, 64, 2560, 2560, 2560, 64] - - [3, 4454.8] - - - [3072, 512, 1, 64, 3072, 3072, 3072, 64] - - [17, 4254.67] - - - [512, 1024, 1, 64, 512, 512, 512, 64] - - [14, 2122.35] - - - [1024, 1024, 1, 64, 1024, 1024, 1024, 64] - - [20, 3584.78] - - - [1536, 1024, 1, 64, 1536, 1536, 1536, 64] - - [3, 5123.6] - - - [2048, 1024, 1, 64, 2048, 2048, 2048, 64] - - [3, 6276.11] - - - [2560, 1024, 1, 64, 2560, 2560, 2560, 64] - - [6, 6942.63] - - - [3072, 1024, 1, 64, 3072, 3072, 3072, 64] - - [7, 7808.35] - - - [512, 1536, 1, 64, 512, 512, 512, 64] - - [4, 2789.38] - - - [1024, 1536, 1, 64, 1024, 1024, 1024, 64] - - [3, 5102.82] - - - [1536, 1536, 1, 64, 1536, 1536, 1536, 64] - - [2, 6741.75] - - - [2048, 1536, 1, 64, 2048, 2048, 2048, 64] - - [3, 8102.81] - - - [2560, 1536, 1, 64, 2560, 2560, 2560, 64] - - [2, 9207.63] - - - [3072, 1536, 1, 64, 3072, 3072, 3072, 64] - - [3, 10299.8] - - - [512, 2048, 1, 64, 512, 512, 512, 64] - - [7, 3766.25] - - - [1024, 2048, 1, 64, 1024, 1024, 1024, 64] - - [2, 6254.18] - - - [1536, 2048, 1, 64, 1536, 1536, 1536, 64] - - [3, 7919.07] - - - [2048, 2048, 1, 64, 2048, 2048, 2048, 64] - - [2, 9321.0] - - - [2560, 2048, 1, 64, 2560, 2560, 2560, 64] - - [3, 10635.7] - - - [3072, 2048, 1, 64, 3072, 3072, 3072, 64] - - [3, 12069.0] - - - [512, 2560, 1, 64, 512, 512, 512, 64] - - [3, 4218.03] - - - [1024, 2560, 1, 64, 1024, 1024, 1024, 64] - - [2, 7070.04] - - - [1536, 2560, 1, 64, 1536, 1536, 1536, 64] - - [6, 9325.68] - - - [2048, 2560, 1, 64, 2048, 2048, 2048, 64] - - [3, 11009.6] - - - [2560, 2560, 1, 64, 2560, 2560, 2560, 64] - - [3, 12362.0] - - - [3072, 2560, 1, 64, 3072, 3072, 3072, 64] - - [7, 13337.8] - - - [512, 3072, 1, 64, 512, 512, 512, 64] - - [6, 5155.08] - - - [1024, 3072, 1, 64, 1024, 1024, 1024, 64] - - [7, 7920.79] - - - [1536, 3072, 1, 64, 1536, 1536, 1536, 64] - - [6, 10022.4] - - - [2048, 3072, 1, 64, 2048, 2048, 2048, 64] - - [3, 12052.6] - - - [2560, 3072, 1, 64, 2560, 2560, 2560, 64] - - [3, 13362.6] - - - [3072, 3072, 1, 64, 3072, 3072, 3072, 64] - - [2, 14403.3] - - - [512, 512, 1, 256, 512, 512, 512, 256] - - [0, 3747.32] - - - [1024, 512, 1, 256, 1024, 1024, 1024, 256] - - [6, 7243.46] - - - [1536, 512, 1, 256, 1536, 1536, 1536, 256] - - [8, 7307.28] - - - [2048, 512, 1, 256, 2048, 2048, 2048, 256] - - [2, 11509.0] - - - [2560, 512, 1, 256, 2560, 2560, 2560, 256] - - [2, 12441.2] - - - [3072, 512, 1, 256, 3072, 3072, 3072, 256] - - [14, 14099.2] - - - [512, 1024, 1, 256, 512, 512, 512, 256] - - [14, 7025.08] - - - [1024, 1024, 1, 256, 1024, 1024, 1024, 256] - - [6, 11538.9] - - - [1536, 1024, 1, 256, 1536, 1536, 1536, 256] - - [6, 14617.0] - - - [2048, 1024, 1, 256, 2048, 2048, 2048, 256] - - [2, 16680.5] - - - [2560, 1024, 1, 256, 2560, 2560, 2560, 256] - - [6, 18354.1] - - - [3072, 1024, 1, 256, 3072, 3072, 3072, 256] - - [2, 20047.2] - - - [512, 1536, 1, 256, 512, 512, 512, 256] - - [2, 9348.59] - - - [1024, 1536, 1, 256, 1024, 1024, 1024, 256] - - [6, 14485.5] - - - [1536, 1536, 1, 256, 1536, 1536, 1536, 256] - - [6, 17521.9] - - - [2048, 1536, 1, 256, 2048, 2048, 2048, 256] - - [2, 20114.8] - - - [2560, 1536, 1, 256, 2560, 2560, 2560, 256] - - [6, 22041.2] - - - [3072, 1536, 1, 256, 3072, 3072, 3072, 256] - - [2, 23013.8] - - - [512, 2048, 1, 256, 512, 512, 512, 256] - - [2, 11085.3] - - - [1024, 2048, 1, 256, 1024, 1024, 1024, 256] - - [2, 16850.9] - - - [1536, 2048, 1, 256, 1536, 1536, 1536, 256] - - [2, 20295.0] - - - [2048, 2048, 1, 256, 2048, 2048, 2048, 256] - - [2, 22703.6] - - - [2560, 2048, 1, 256, 2560, 2560, 2560, 256] - - [2, 24267.1] - - - [3072, 2048, 1, 256, 3072, 3072, 3072, 256] - - [6, 26107.7] - - - [512, 2560, 1, 256, 512, 512, 512, 256] - - [2, 12965.9] - - - [1024, 2560, 1, 256, 1024, 1024, 1024, 256] - - [2, 18366.6] - - - [1536, 2560, 1, 256, 1536, 1536, 1536, 256] - - [2, 22004.8] - - - [2048, 2560, 1, 256, 2048, 2048, 2048, 256] - - [2, 24317.7] - - - [2560, 2560, 1, 256, 2560, 2560, 2560, 256] - - [2, 25695.5] - - - [3072, 2560, 1, 256, 3072, 3072, 3072, 256] - - [2, 26936.9] - - - [512, 3072, 1, 256, 512, 512, 512, 256] - - [14, 14423.2] - - - [1024, 3072, 1, 256, 1024, 1024, 1024, 256] - - [6, 20027.0] - - - [1536, 3072, 1, 256, 1536, 1536, 1536, 256] - - [2, 23488.8] - - - [2048, 3072, 1, 256, 2048, 2048, 2048, 256] - - [19, 25372.4] - - - [2560, 3072, 1, 256, 2560, 2560, 2560, 256] - - [2, 26893.4] - - - [3072, 3072, 1, 256, 3072, 3072, 3072, 256] - - [6, 28682.9] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [11, 6659.94] - - - [1024, 512, 1, 512, 1024, 1024, 1024, 512] - - [14, 11566.3] - - - [1536, 512, 1, 512, 1536, 1536, 1536, 512] - - [6, 14604.0] - - - [2048, 512, 1, 512, 2048, 2048, 2048, 512] - - [14, 17389.7] - - - [2560, 512, 1, 512, 2560, 2560, 2560, 512] - - [6, 19021.2] - - - [3072, 512, 1, 512, 3072, 3072, 3072, 512] - - [19, 20904.8] - - - [512, 1024, 1, 512, 512, 512, 512, 512] - - [14, 11964.0] - - - [1024, 1024, 1, 512, 1024, 1024, 1024, 512] - - [6, 17177.7] - - - [1536, 1024, 1, 512, 1536, 1536, 1536, 512] - - [2, 20896.7] - - - [2048, 1024, 1, 512, 2048, 2048, 2048, 512] - - [2, 17762.9] - - - [2560, 1024, 1, 512, 2560, 2560, 2560, 512] - - [19, 25167.4] - - - [3072, 1024, 1, 512, 3072, 3072, 3072, 512] - - [6, 26659.8] - - - [512, 1536, 1, 512, 512, 512, 512, 512] - - [2, 14577.3] - - - [1024, 1536, 1, 512, 1024, 1024, 1024, 512] - - [25, 15994.3] - - - [1536, 1536, 1, 512, 1536, 1536, 1536, 512] - - [14, 23856.2] - - - [2048, 1536, 1, 512, 2048, 2048, 2048, 512] - - [14, 26373.2] - - - [2560, 1536, 1, 512, 2560, 2560, 2560, 512] - - [6, 27608.5] - - - [3072, 1536, 1, 512, 3072, 3072, 3072, 512] - - [2, 29444.8] - - - [512, 2048, 1, 512, 512, 512, 512, 512] - - [6, 17566.3] - - - [1024, 2048, 1, 512, 1024, 1024, 1024, 512] - - [2, 22682.0] - - - [1536, 2048, 1, 512, 1536, 1536, 1536, 512] - - [14, 26297.9] - - - [2048, 2048, 1, 512, 2048, 2048, 2048, 512] - - [2, 28275.7] - - - [2560, 2048, 1, 512, 2560, 2560, 2560, 512] - - [2, 29643.4] - - - [3072, 2048, 1, 512, 3072, 3072, 3072, 512] - - [2, 31025.5] - - - [512, 2560, 1, 512, 512, 512, 512, 512] - - [14, 18754.9] - - - [1024, 2560, 1, 512, 1024, 1024, 1024, 512] - - [2, 24558.2] - - - [1536, 2560, 1, 512, 1536, 1536, 1536, 512] - - [6, 27629.3] - - - [2048, 2560, 1, 512, 2048, 2048, 2048, 512] - - [6, 29677.8] - - - [2560, 2560, 1, 512, 2560, 2560, 2560, 512] - - [6, 31297.4] - - - [3072, 2560, 1, 512, 3072, 3072, 3072, 512] - - [6, 32040.3] - - - [512, 3072, 1, 512, 512, 512, 512, 512] - - [2, 20975.6] - - - [1024, 3072, 1, 512, 1024, 1024, 1024, 512] - - [2, 26490.8] - - - [1536, 3072, 1, 512, 1536, 1536, 1536, 512] - - [14, 29040.5] - - - [2048, 3072, 1, 512, 2048, 2048, 2048, 512] - - [6, 31019.4] - - - [2560, 3072, 1, 512, 2560, 2560, 2560, 512] - - [2, 31626.8] - - - [3072, 3072, 1, 512, 3072, 3072, 3072, 512] - - [2, 33263.9] - - - [512, 512, 1, 1024, 512, 512, 512, 1024] - - [11, 10976.3] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] - - [14, 17862.1] - - - [1536, 512, 1, 1024, 1536, 1536, 1536, 1024] - - [24, 17439.6] - - - [2048, 512, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 23430.8] - - - [2560, 512, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 24764.8] - - - [3072, 512, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 26835.5] - - - [512, 1024, 1, 1024, 512, 512, 512, 1024] - - [19, 18085.3] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 23671.6] - - - [1536, 1024, 1, 1024, 1536, 1536, 1536, 1024] - - [2, 26453.8] - - - [2048, 1024, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 28648.2] - - - [2560, 1024, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 29639.9] - - - [3072, 1024, 1, 1024, 3072, 3072, 3072, 1024] - - [6, 30994.0] - - - [512, 1536, 1, 1024, 512, 512, 512, 1024] - - [19, 20362.0] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1024] - - [19, 26651.0] - - - [1536, 1536, 1, 1024, 1536, 1536, 1536, 1024] - - [14, 28667.6] - - - [2048, 1536, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 31269.9] - - - [2560, 1536, 1, 1024, 2560, 2560, 2560, 1024] - - [19, 31786.8] - - - [3072, 1536, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 32638.1] - - - [512, 2048, 1, 1024, 512, 512, 512, 1024] - - [19, 23477.0] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 28711.4] - - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 1024] - - [2, 31338.4] - - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 32280.9] - - - [2560, 2048, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 33493.4] - - - [3072, 2048, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 33769.3] - - - [512, 2560, 1, 1024, 512, 512, 512, 1024] - - [6, 24833.8] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 29926.4] - - - [1536, 2560, 1, 1024, 1536, 1536, 1536, 1024] - - [6, 31938.4] - - - [2048, 2560, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 33211.2] - - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 1024] - - [2, 34754.8] - - - [3072, 2560, 1, 1024, 3072, 3072, 3072, 1024] - - [14, 35387.6] - - - [512, 3072, 1, 1024, 512, 512, 512, 1024] - - [14, 26864.8] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] - - [14, 31058.4] - - - [1536, 3072, 1, 1024, 1536, 1536, 1536, 1024] - - [14, 32940.1] - - - [2048, 3072, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 34567.0] - - - [2560, 3072, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 34955.1] - - - [3072, 3072, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 35700.4] - - - [512, 512, 1, 2048, 512, 512, 512, 2048] - - [12, 15600.1] - - - [1024, 512, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 23765.9] - - - [1536, 512, 1, 2048, 1536, 1536, 1536, 2048] - - [19, 20093.5] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 27942.4] - - - [2560, 512, 1, 2048, 2560, 2560, 2560, 2048] - - [6, 28709.5] - - - [3072, 512, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 30761.4] - - - [512, 1024, 1, 2048, 512, 512, 512, 2048] - - [6, 23871.8] - - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 28220.0] - - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 2048] - - [14, 30778.9] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 32535.7] - - - [2560, 1024, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 33459.0] - - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 34043.7] - - - [512, 1536, 1, 2048, 512, 512, 512, 2048] - - [14, 24983.7] - - - [1024, 1536, 1, 2048, 1024, 1024, 1024, 2048] - - [2, 30956.8] - - - [1536, 1536, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 32333.8] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 33769.3] - - - [2560, 1536, 1, 2048, 2560, 2560, 2560, 2048] - - [14, 34579.2] - - - [3072, 1536, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 34607.7] - - - [512, 2048, 1, 2048, 512, 512, 512, 2048] - - [15, 27721.8] - - - [1024, 2048, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 32373.6] - - - [1536, 2048, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 34114.2] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 34644.4] - - - [2560, 2048, 1, 2048, 2560, 2560, 2560, 2048] - - [2, 35409.6] - - - [3072, 2048, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 36021.8] - - - [512, 2560, 1, 2048, 512, 512, 512, 2048] - - [2, 28845.4] - - - [1024, 2560, 1, 2048, 1024, 1024, 1024, 2048] - - [2, 33035.2] - - - [1536, 2560, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 34694.1] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 35699.2] - - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 35409.2] - - - [3072, 2560, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 35717.4] - - - [512, 3072, 1, 2048, 512, 512, 512, 2048] - - [2, 30754.1] - - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 34233.1] - - - [1536, 3072, 1, 2048, 1536, 1536, 1536, 2048] - - [14, 35127.6] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 2048] - - [19, 35648.4] - - - [2560, 3072, 1, 2048, 2560, 2560, 2560, 2048] - - [6, 35884.1] - - - [3072, 3072, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 37382.1] - - - [512, 512, 1, 3072, 512, 512, 512, 3072] - - [12, 18127.9] - - - [1024, 512, 1, 3072, 1024, 1024, 1024, 3072] - - [19, 26442.7] - - - [1536, 512, 1, 3072, 1536, 1536, 1536, 3072] - - [6, 26322.2] - - - [2048, 512, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 30968.7] - - - [2560, 512, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 30018.1] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 32424.1] - - - [512, 1024, 1, 3072, 512, 512, 512, 3072] - - [2, 27073.0] - - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 3072] - - [2, 30600.4] - - - [1536, 1024, 1, 3072, 1536, 1536, 1536, 3072] - - [19, 32482.0] - - - [2048, 1024, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 33371.5] - - - [2560, 1024, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 34360.5] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 3072] - - [2, 34915.3] - - - [512, 1536, 1, 3072, 512, 512, 512, 3072] - - [2, 27553.8] - - - [1024, 1536, 1, 3072, 1024, 1024, 1024, 3072] - - [2, 32703.2] - - - [1536, 1536, 1, 3072, 1536, 1536, 1536, 3072] - - [6, 33662.8] - - - [2048, 1536, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 34536.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 2560, 3072] - - [6, 35147.2] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 3072] - - [19, 35200.9] - - - [512, 2048, 1, 3072, 512, 512, 512, 3072] - - [2, 31780.0] - - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 3072] - - [14, 33546.5] - - - [1536, 2048, 1, 3072, 1536, 1536, 1536, 3072] - - [14, 34553.2] - - - [2048, 2048, 1, 3072, 2048, 2048, 2048, 3072] - - [14, 35790.3] - - - [2560, 2048, 1, 3072, 2560, 2560, 2560, 3072] - - [2, 36033.7] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 36455.0] - - - [512, 2560, 1, 3072, 512, 512, 512, 3072] - - [6, 31027.4] - - - [1024, 2560, 1, 3072, 1024, 1024, 1024, 3072] - - [6, 35006.1] - - - [1536, 2560, 1, 3072, 1536, 1536, 1536, 3072] - - [19, 35047.6] - - - [2048, 2560, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 36074.1] - - - [2560, 2560, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 36488.0] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 3072] - - [14, 36276.3] - - - [512, 3072, 1, 3072, 512, 512, 512, 3072] - - [2, 33344.3] - - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] - - [14, 35036.4] - - - [1536, 3072, 1, 3072, 1536, 1536, 1536, 3072] - - [14, 36277.7] - - - [2048, 3072, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 35989.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 2560, 3072] - - [6, 36792.8] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [2, 37505.2] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [13, 6.53659e-05] - - - [1, 1, 1, 64, 1, 1, 1, 64] - - [23, 0.00415476] - - - [1, 64, 1, 1, 1, 1, 1, 1] - - [5, 0.00401506] - - - [64, 1, 1, 1, 64, 64, 64, 1] - - [12, 0.00426141] - - - [64, 64, 1, 1, 64, 64, 64, 1] - - [18, 0.268344] - - - [64, 1, 1, 64, 64, 64, 64, 64] - - [22, 0.267642] - - - [1, 64, 1, 64, 1, 1, 1, 64] - - [22, 0.266519] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [9, 16.5652] - - - [64, 64, 1, 256, 64, 64, 64, 256] - - [10, 65.3522] - - - [64, 64, 1, 512, 64, 64, 64, 512] - - [22, 116.908] - - - [64, 64, 1, 1024, 64, 64, 64, 1024] - - [22, 202.047] - - - [64, 64, 1, 2048, 64, 64, 64, 2048] - - [22, 318.541] - - - [64, 64, 1, 4096, 64, 64, 64, 4096] - - [21, 435.687] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index b1fd92133b3..00000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,8050 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1201 -- gfx1201 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS0_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS0_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 27392 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB256_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG64_2_1 - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB256_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS0_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS0_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 27392 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 64, 512] - - [4, 1083.17] - - - [1024, 512, 1, 64, 1024, 1024, 64, 512] - - [6, 2149.62] - - - [1536, 512, 1, 64, 1536, 1536, 64, 512] - - [3, 2862.44] - - - [2048, 512, 1, 64, 2048, 2048, 64, 512] - - [1, 3757.71] - - - [2560, 512, 1, 64, 2560, 2560, 64, 512] - - [9, 4330.39] - - - [3072, 512, 1, 64, 3072, 3072, 64, 512] - - [1, 5161.69] - - - [512, 1024, 1, 64, 512, 512, 64, 1024] - - [19, 2075.04] - - - [1024, 1024, 1, 64, 1024, 1024, 64, 1024] - - [20, 3608.97] - - - [1536, 1024, 1, 64, 1536, 1536, 64, 1024] - - [23, 4639.72] - - - [2048, 1024, 1, 64, 2048, 2048, 64, 1024] - - [1, 6143.81] - - - [2560, 1024, 1, 64, 2560, 2560, 64, 1024] - - [1, 7283.99] - - - [3072, 1024, 1, 64, 3072, 3072, 64, 1024] - - [6, 7999.47] - - - [512, 1536, 1, 64, 512, 512, 64, 1536] - - [6, 2970.91] - - - [1024, 1536, 1, 64, 1024, 1024, 64, 1536] - - [8, 5152.58] - - - [1536, 1536, 1, 64, 1536, 1536, 64, 1536] - - [6, 6800.96] - - - [2048, 1536, 1, 64, 2048, 2048, 64, 1536] - - [8, 8088.0] - - - [2560, 1536, 1, 64, 2560, 2560, 64, 1536] - - [2, 9133.94] - - - [3072, 1536, 1, 64, 3072, 3072, 64, 1536] - - [6, 10264.6] - - - [512, 2048, 1, 64, 512, 512, 64, 2048] - - [19, 3696.54] - - - [1024, 2048, 1, 64, 1024, 1024, 64, 2048] - - [1, 6298.34] - - - [1536, 2048, 1, 64, 1536, 1536, 64, 2048] - - [19, 7790.22] - - - [2048, 2048, 1, 64, 2048, 2048, 64, 2048] - - [7, 9700.44] - - - [2560, 2048, 1, 64, 2560, 2560, 64, 2048] - - [9, 10984.3] - - - [3072, 2048, 1, 64, 3072, 3072, 64, 2048] - - [9, 12022.0] - - - [512, 2560, 1, 64, 512, 512, 64, 2560] - - [20, 4391.71] - - - [1024, 2560, 1, 64, 1024, 1024, 64, 2560] - - [6, 7101.62] - - - [1536, 2560, 1, 64, 1536, 1536, 64, 2560] - - [1, 9360.37] - - - [2048, 2560, 1, 64, 2048, 2048, 64, 2560] - - [1, 11006.0] - - - [2560, 2560, 1, 64, 2560, 2560, 64, 2560] - - [8, 12365.6] - - - [3072, 2560, 1, 64, 3072, 3072, 64, 2560] - - [7, 13479.1] - - - [512, 3072, 1, 64, 512, 512, 64, 3072] - - [0, 4434.99] - - - [1024, 3072, 1, 64, 1024, 1024, 64, 3072] - - [6, 8175.2] - - - [1536, 3072, 1, 64, 1536, 1536, 64, 3072] - - [6, 10493.2] - - - [2048, 3072, 1, 64, 2048, 2048, 64, 3072] - - [7, 11785.7] - - - [2560, 3072, 1, 64, 2560, 2560, 64, 3072] - - [7, 13452.3] - - - [3072, 3072, 1, 64, 3072, 3072, 64, 3072] - - [6, 14621.4] - - - [512, 512, 1, 256, 512, 512, 256, 512] - - [13, 3950.6] - - - [1024, 512, 1, 256, 1024, 1024, 256, 512] - - [21, 7257.17] - - - [1536, 512, 1, 256, 1536, 1536, 256, 512] - - [5, 7936.24] - - - [2048, 512, 1, 256, 2048, 2048, 256, 512] - - [1, 11596.0] - - - [2560, 512, 1, 256, 2560, 2560, 256, 512] - - [21, 12681.2] - - - [3072, 512, 1, 256, 3072, 3072, 256, 512] - - [22, 13454.7] - - - [512, 1024, 1, 256, 512, 512, 256, 1024] - - [27, 5362.92] - - - [1024, 1024, 1, 256, 1024, 1024, 256, 1024] - - [21, 11518.9] - - - [1536, 1024, 1, 256, 1536, 1536, 256, 1024] - - [6, 14363.5] - - - [2048, 1024, 1, 256, 2048, 2048, 256, 1024] - - [6, 16690.9] - - - [2560, 1024, 1, 256, 2560, 2560, 256, 1024] - - [1, 18860.1] - - - [3072, 1024, 1, 256, 3072, 3072, 256, 1024] - - [6, 19744.2] - - - [512, 1536, 1, 256, 512, 512, 256, 1536] - - [19, 9025.27] - - - [1024, 1536, 1, 256, 1024, 1024, 256, 1536] - - [1, 14474.8] - - - [1536, 1536, 1, 256, 1536, 1536, 256, 1536] - - [6, 17097.1] - - - [2048, 1536, 1, 256, 2048, 2048, 256, 1536] - - [6, 20107.5] - - - [2560, 1536, 1, 256, 2560, 2560, 256, 1536] - - [6, 21992.8] - - - [3072, 1536, 1, 256, 3072, 3072, 256, 1536] - - [6, 23393.3] - - - [512, 2048, 1, 256, 512, 512, 256, 2048] - - [19, 11514.2] - - - [1024, 2048, 1, 256, 1024, 1024, 256, 2048] - - [6, 16835.1] - - - [1536, 2048, 1, 256, 1536, 1536, 256, 2048] - - [6, 20200.8] - - - [2048, 2048, 1, 256, 2048, 2048, 256, 2048] - - [6, 21977.2] - - - [2560, 2048, 1, 256, 2560, 2560, 256, 2048] - - [6, 24251.5] - - - [3072, 2048, 1, 256, 3072, 3072, 256, 2048] - - [6, 25706.9] - - - [512, 2560, 1, 256, 512, 512, 256, 2560] - - [6, 8237.86] - - - [1024, 2560, 1, 256, 1024, 1024, 256, 2560] - - [6, 18794.3] - - - [1536, 2560, 1, 256, 1536, 1536, 256, 2560] - - [6, 21764.1] - - - [2048, 2560, 1, 256, 2048, 2048, 256, 2560] - - [6, 23881.9] - - - [2560, 2560, 1, 256, 2560, 2560, 256, 2560] - - [21, 25509.3] - - - [3072, 2560, 1, 256, 3072, 3072, 256, 2560] - - [6, 27707.3] - - - [512, 3072, 1, 256, 512, 512, 256, 3072] - - [8, 14498.3] - - - [1024, 3072, 1, 256, 1024, 1024, 256, 3072] - - [6, 19817.3] - - - [1536, 3072, 1, 256, 1536, 1536, 256, 3072] - - [6, 22961.1] - - - [2048, 3072, 1, 256, 2048, 2048, 256, 3072] - - [6, 25392.4] - - - [2560, 3072, 1, 256, 2560, 2560, 256, 3072] - - [8, 27016.5] - - - [3072, 3072, 1, 256, 3072, 3072, 256, 3072] - - [8, 28086.5] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [26, 6889.5] - - - [1024, 512, 1, 512, 1024, 1024, 512, 512] - - [1, 12085.4] - - - [1536, 512, 1, 512, 1536, 1536, 512, 512] - - [14, 9874.52] - - - [2048, 512, 1, 512, 2048, 2048, 512, 512] - - [6, 17180.4] - - - [2560, 512, 1, 512, 2560, 2560, 512, 512] - - [6, 18934.8] - - - [3072, 512, 1, 512, 3072, 3072, 512, 512] - - [1, 21194.2] - - - [512, 1024, 1, 512, 512, 512, 512, 1024] - - [6, 12156.6] - - - [1024, 1024, 1, 512, 1024, 1024, 512, 1024] - - [21, 17796.6] - - - [1536, 1024, 1, 512, 1536, 1536, 512, 1024] - - [1, 21025.2] - - - [2048, 1024, 1, 512, 2048, 2048, 512, 1024] - - [6, 23354.1] - - - [2560, 1024, 1, 512, 2560, 2560, 512, 1024] - - [21, 24928.3] - - - [3072, 1024, 1, 512, 3072, 3072, 512, 1024] - - [1, 26813.2] - - - [512, 1536, 1, 512, 512, 512, 512, 1536] - - [21, 14516.6] - - - [1024, 1536, 1, 512, 1024, 1024, 512, 1536] - - [6, 21185.9] - - - [1536, 1536, 1, 512, 1536, 1536, 512, 1536] - - [6, 24108.8] - - - [2048, 1536, 1, 512, 2048, 2048, 512, 1536] - - [6, 26635.3] - - - [2560, 1536, 1, 512, 2560, 2560, 512, 1536] - - [21, 27782.2] - - - [3072, 1536, 1, 512, 3072, 3072, 512, 1536] - - [6, 29862.3] - - - [512, 2048, 1, 512, 512, 512, 512, 2048] - - [19, 12205.1] - - - [1024, 2048, 1, 512, 1024, 1024, 512, 2048] - - [6, 23129.8] - - - [1536, 2048, 1, 512, 1536, 1536, 512, 2048] - - [6, 26490.8] - - - [2048, 2048, 1, 512, 2048, 2048, 512, 2048] - - [6, 28155.0] - - - [2560, 2048, 1, 512, 2560, 2560, 512, 2048] - - [1, 30072.6] - - - [3072, 2048, 1, 512, 3072, 3072, 512, 2048] - - [1, 30680.4] - - - [512, 2560, 1, 512, 512, 512, 512, 2560] - - [6, 18757.3] - - - [1024, 2560, 1, 512, 1024, 1024, 512, 2560] - - [6, 19614.2] - - - [1536, 2560, 1, 512, 1536, 1536, 512, 2560] - - [6, 27576.2] - - - [2048, 2560, 1, 512, 2048, 2048, 512, 2560] - - [1, 30077.5] - - - [2560, 2560, 1, 512, 2560, 2560, 512, 2560] - - [6, 31499.1] - - - [3072, 2560, 1, 512, 3072, 3072, 512, 2560] - - [6, 32458.2] - - - [512, 3072, 1, 512, 512, 512, 512, 3072] - - [21, 21039.2] - - - [1024, 3072, 1, 512, 1024, 1024, 512, 3072] - - [1, 26199.0] - - - [1536, 3072, 1, 512, 1536, 1536, 512, 3072] - - [6, 29460.8] - - - [2048, 3072, 1, 512, 2048, 2048, 512, 3072] - - [1, 30827.6] - - - [2560, 3072, 1, 512, 2560, 2560, 512, 3072] - - [1, 31993.0] - - - [3072, 3072, 1, 512, 3072, 3072, 512, 3072] - - [6, 33276.4] - - - [512, 512, 1, 1024, 512, 512, 1024, 512] - - [12, 10971.8] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 512] - - [21, 18310.7] - - - [1536, 512, 1, 1024, 1536, 1536, 1024, 512] - - [1, 19266.4] - - - [2048, 512, 1, 1024, 2048, 2048, 1024, 512] - - [19, 18428.7] - - - [2560, 512, 1, 1024, 2560, 2560, 1024, 512] - - [1, 24497.7] - - - [3072, 512, 1, 1024, 3072, 3072, 1024, 512] - - [1, 26869.1] - - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] - - [21, 18273.3] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [6, 24337.7] - - - [1536, 1024, 1, 1024, 1536, 1536, 1024, 1024] - - [19, 26873.8] - - - [2048, 1024, 1, 1024, 2048, 2048, 1024, 1024] - - [19, 29022.4] - - - [2560, 1024, 1, 1024, 2560, 2560, 1024, 1024] - - [19, 30283.1] - - - [3072, 1024, 1, 1024, 3072, 3072, 1024, 1024] - - [1, 31153.2] - - - [512, 1536, 1, 1024, 512, 512, 1024, 1536] - - [6, 20246.8] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1536] - - [1, 27449.5] - - - [1536, 1536, 1, 1024, 1536, 1536, 1024, 1536] - - [21, 28920.3] - - - [2048, 1536, 1, 1024, 2048, 2048, 1024, 1536] - - [6, 31424.3] - - - [2560, 1536, 1, 1024, 2560, 2560, 1024, 1536] - - [1, 31814.4] - - - [3072, 1536, 1, 1024, 3072, 3072, 1024, 1536] - - [1, 33218.0] - - - [512, 2048, 1, 1024, 512, 512, 1024, 2048] - - [21, 24365.6] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 2048] - - [21, 29038.1] - - - [1536, 2048, 1, 1024, 1536, 1536, 1024, 2048] - - [1, 31145.5] - - - [2048, 2048, 1, 1024, 2048, 2048, 1024, 2048] - - [19, 32352.7] - - - [2560, 2048, 1, 1024, 2560, 2560, 1024, 2048] - - [19, 33927.1] - - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 2048] - - [1, 33613.0] - - - [512, 2560, 1, 1024, 512, 512, 1024, 2560] - - [6, 24540.4] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 2560] - - [22, 29472.2] - - - [1536, 2560, 1, 1024, 1536, 1536, 1024, 2560] - - [19, 32023.7] - - - [2048, 2560, 1, 1024, 2048, 2048, 1024, 2560] - - [6, 33480.9] - - - [2560, 2560, 1, 1024, 2560, 2560, 1024, 2560] - - [1, 33714.7] - - - [3072, 2560, 1, 1024, 3072, 3072, 1024, 2560] - - [21, 34741.4] - - - [512, 3072, 1, 1024, 512, 512, 1024, 3072] - - [1, 27182.6] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] - - [6, 31539.8] - - - [1536, 3072, 1, 1024, 1536, 1536, 1024, 3072] - - [21, 33705.5] - - - [2048, 3072, 1, 1024, 2048, 2048, 1024, 3072] - - [19, 34361.0] - - - [2560, 3072, 1, 1024, 2560, 2560, 1024, 3072] - - [19, 35487.5] - - - [3072, 3072, 1, 1024, 3072, 3072, 1024, 3072] - - [8, 35326.3] - - - [512, 512, 1, 2048, 512, 512, 2048, 512] - - [28, 15679.9] - - - [1024, 512, 1, 2048, 1024, 1024, 2048, 512] - - [21, 24296.6] - - - [1536, 512, 1, 2048, 1536, 1536, 2048, 512] - - [21, 24278.2] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 512] - - [6, 29739.2] - - - [2560, 512, 1, 2048, 2560, 2560, 2048, 512] - - [6, 28423.0] - - - [3072, 512, 1, 2048, 3072, 3072, 2048, 512] - - [21, 31795.6] - - - [512, 1024, 1, 2048, 512, 512, 2048, 1024] - - [21, 24437.9] - - - [1024, 1024, 1, 2048, 1024, 1024, 2048, 1024] - - [6, 29380.4] - - - [1536, 1024, 1, 2048, 1536, 1536, 2048, 1024] - - [21, 31789.3] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 1024] - - [20, 31191.0] - - - [2560, 1024, 1, 2048, 2560, 2560, 2048, 1024] - - [21, 33912.0] - - - [3072, 1024, 1, 2048, 3072, 3072, 2048, 1024] - - [6, 35255.0] - - - [512, 1536, 1, 2048, 512, 512, 2048, 1536] - - [6, 24873.6] - - - [1024, 1536, 1, 2048, 1024, 1024, 2048, 1536] - - [6, 31882.3] - - - [1536, 1536, 1, 2048, 1536, 1536, 2048, 1536] - - [21, 31570.7] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 1536] - - [21, 34462.4] - - - [2560, 1536, 1, 2048, 2560, 2560, 2048, 1536] - - [21, 34598.5] - - - [3072, 1536, 1, 2048, 3072, 3072, 2048, 1536] - - [6, 35573.9] - - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] - - [6, 29667.1] - - - [1024, 2048, 1, 2048, 1024, 1024, 2048, 2048] - - [6, 33209.9] - - - [1536, 2048, 1, 2048, 1536, 1536, 2048, 2048] - - [6, 34438.3] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 35818.1] - - - [2560, 2048, 1, 2048, 2560, 2560, 2048, 2048] - - [6, 36040.1] - - - [3072, 2048, 1, 2048, 3072, 3072, 2048, 2048] - - [6, 36782.1] - - - [512, 2560, 1, 2048, 512, 512, 2048, 2560] - - [22, 27572.0] - - - [1024, 2560, 1, 2048, 1024, 1024, 2048, 2560] - - [1, 33165.8] - - - [1536, 2560, 1, 2048, 1536, 1536, 2048, 2560] - - [6, 34165.4] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2560] - - [6, 35669.4] - - - [2560, 2560, 1, 2048, 2560, 2560, 2048, 2560] - - [21, 35789.5] - - - [3072, 2560, 1, 2048, 3072, 3072, 2048, 2560] - - [6, 36272.2] - - - [512, 3072, 1, 2048, 512, 512, 2048, 3072] - - [19, 31784.6] - - - [1024, 3072, 1, 2048, 1024, 1024, 2048, 3072] - - [6, 35267.6] - - - [1536, 3072, 1, 2048, 1536, 1536, 2048, 3072] - - [6, 35707.6] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 3072] - - [21, 35920.7] - - - [2560, 3072, 1, 2048, 2560, 2560, 2048, 3072] - - [21, 36186.1] - - - [3072, 3072, 1, 2048, 3072, 3072, 2048, 3072] - - [6, 36613.0] - - - [512, 512, 1, 3072, 512, 512, 3072, 512] - - [28, 18056.6] - - - [1024, 512, 1, 3072, 1024, 1024, 3072, 512] - - [6, 27553.0] - - - [1536, 512, 1, 3072, 1536, 1536, 3072, 512] - - [19, 26363.9] - - - [2048, 512, 1, 3072, 2048, 2048, 3072, 512] - - [6, 32272.8] - - - [2560, 512, 1, 3072, 2560, 2560, 3072, 512] - - [19, 30756.5] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 512] - - [21, 33875.9] - - - [512, 1024, 1, 3072, 512, 512, 3072, 1024] - - [21, 27610.0] - - - [1024, 1024, 1, 3072, 1024, 1024, 3072, 1024] - - [21, 31679.6] - - - [1536, 1024, 1, 3072, 1536, 1536, 3072, 1024] - - [1, 33794.0] - - - [2048, 1024, 1, 3072, 2048, 2048, 3072, 1024] - - [6, 34582.0] - - - [2560, 1024, 1, 3072, 2560, 2560, 3072, 1024] - - [6, 34984.0] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] - - [6, 36031.0] - - - [512, 1536, 1, 3072, 512, 512, 3072, 1536] - - [6, 23558.9] - - - [1024, 1536, 1, 3072, 1024, 1024, 3072, 1536] - - [1, 33291.3] - - - [1536, 1536, 1, 3072, 1536, 1536, 3072, 1536] - - [8, 33039.3] - - - [2048, 1536, 1, 3072, 2048, 2048, 3072, 1536] - - [21, 35348.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 3072, 1536] - - [1, 35120.0] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 1536] - - [21, 35699.4] - - - [512, 2048, 1, 3072, 512, 512, 3072, 2048] - - [1, 32285.7] - - - [1024, 2048, 1, 3072, 1024, 1024, 3072, 2048] - - [6, 34666.7] - - - [1536, 2048, 1, 3072, 1536, 1536, 3072, 2048] - - [6, 35985.3] - - - [2048, 2048, 1, 3072, 2048, 2048, 3072, 2048] - - [21, 35977.0] - - - [2560, 2048, 1, 3072, 2560, 2560, 3072, 2048] - - [6, 37045.4] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 2048] - - [1, 36987.5] - - - [512, 2560, 1, 3072, 512, 512, 3072, 2560] - - [19, 29945.3] - - - [1024, 2560, 1, 3072, 1024, 1024, 3072, 2560] - - [19, 35250.2] - - - [1536, 2560, 1, 3072, 1536, 1536, 3072, 2560] - - [6, 35889.4] - - - [2048, 2560, 1, 3072, 2048, 2048, 3072, 2560] - - [21, 36970.0] - - - [2560, 2560, 1, 3072, 2560, 2560, 3072, 2560] - - [8, 36851.2] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 2560] - - [21, 36784.3] - - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] - - [6, 33508.8] - - - [1024, 3072, 1, 3072, 1024, 1024, 3072, 3072] - - [19, 36071.5] - - - [1536, 3072, 1, 3072, 1536, 1536, 3072, 3072] - - [6, 36754.1] - - - [2048, 3072, 1, 3072, 2048, 2048, 3072, 3072] - - [6, 37061.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 3072, 3072] - - [6, 36768.6] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 37017.4] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [15, 6.66533e-05] - - - [1, 1, 1, 64, 1, 1, 64, 1] - - [15, 0.00412252] - - - [1, 64, 1, 1, 1, 1, 1, 64] - - [16, 0.00426852] - - - [64, 1, 1, 1, 64, 64, 1, 1] - - [25, 0.00407916] - - - [64, 64, 1, 1, 64, 64, 1, 64] - - [17, 0.266164] - - - [64, 1, 1, 64, 64, 64, 64, 1] - - [24, 0.271564] - - - [1, 64, 1, 64, 1, 1, 64, 64] - - [18, 0.270658] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [11, 17.2707] - - - [64, 64, 1, 256, 64, 64, 256, 64] - - [11, 63.0286] - - - [64, 64, 1, 512, 64, 64, 512, 64] - - [10, 115.644] - - - [64, 64, 1, 1024, 64, 64, 1024, 64] - - [24, 203.124] - - - [64, 64, 1, 2048, 64, 64, 2048, 64] - - [10, 317.817] - - - [64, 64, 1, 4096, 64, 64, 4096, 64] - - [10, 442.484] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index 36ee2e984fc..00000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,9086 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1201 -- gfx1201 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG64_2_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 9472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 9472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 9472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 9472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 64, 64] - - [26, 1116.36] - - - [1024, 512, 1, 64, 1024, 1024, 64, 64] - - [24, 2109.68] - - - [1536, 512, 1, 64, 1536, 1536, 64, 64] - - [6, 2920.74] - - - [2048, 512, 1, 64, 2048, 2048, 64, 64] - - [24, 3812.35] - - - [2560, 512, 1, 64, 2560, 2560, 64, 64] - - [9, 4280.56] - - - [3072, 512, 1, 64, 3072, 3072, 64, 64] - - [7, 4977.66] - - - [512, 1024, 1, 64, 512, 512, 64, 64] - - [20, 2015.7] - - - [1024, 1024, 1, 64, 1024, 1024, 64, 64] - - [5, 3695.63] - - - [1536, 1024, 1, 64, 1536, 1536, 64, 64] - - [6, 5176.29] - - - [2048, 1024, 1, 64, 2048, 2048, 64, 64] - - [26, 5979.18] - - - [2560, 1024, 1, 64, 2560, 2560, 64, 64] - - [7, 7139.54] - - - [3072, 1024, 1, 64, 3072, 3072, 64, 64] - - [7, 8016.99] - - - [512, 1536, 1, 64, 512, 512, 64, 64] - - [4, 2992.1] - - - [1024, 1536, 1, 64, 1024, 1024, 64, 64] - - [8, 4945.87] - - - [1536, 1536, 1, 64, 1536, 1536, 64, 64] - - [23, 6493.29] - - - [2048, 1536, 1, 64, 2048, 2048, 64, 64] - - [3, 8039.4] - - - [2560, 1536, 1, 64, 2560, 2560, 64, 64] - - [3, 9563.3] - - - [3072, 1536, 1, 64, 3072, 3072, 64, 64] - - [7, 10179.8] - - - [512, 2048, 1, 64, 512, 512, 64, 64] - - [9, 3660.25] - - - [1024, 2048, 1, 64, 1024, 1024, 64, 64] - - [3, 6320.59] - - - [1536, 2048, 1, 64, 1536, 1536, 64, 64] - - [3, 8217.08] - - - [2048, 2048, 1, 64, 2048, 2048, 64, 64] - - [3, 9524.73] - - - [2560, 2048, 1, 64, 2560, 2560, 64, 64] - - [3, 11184.3] - - - [3072, 2048, 1, 64, 3072, 3072, 64, 64] - - [3, 11892.1] - - - [512, 2560, 1, 64, 512, 512, 64, 64] - - [4, 4524.6] - - - [1024, 2560, 1, 64, 1024, 1024, 64, 64] - - [3, 7363.92] - - - [1536, 2560, 1, 64, 1536, 1536, 64, 64] - - [7, 9451.78] - - - [2048, 2560, 1, 64, 2048, 2048, 64, 64] - - [3, 11067.7] - - - [2560, 2560, 1, 64, 2560, 2560, 64, 64] - - [4, 12345.4] - - - [3072, 2560, 1, 64, 3072, 3072, 64, 64] - - [3, 13638.3] - - - [512, 3072, 1, 64, 512, 512, 64, 64] - - [5, 5016.11] - - - [1024, 3072, 1, 64, 1024, 1024, 64, 64] - - [6, 7863.24] - - - [1536, 3072, 1, 64, 1536, 1536, 64, 64] - - [8, 10419.2] - - - [2048, 3072, 1, 64, 2048, 2048, 64, 64] - - [7, 12251.0] - - - [2560, 3072, 1, 64, 2560, 2560, 64, 64] - - [8, 13673.5] - - - [3072, 3072, 1, 64, 3072, 3072, 64, 64] - - [3, 14721.3] - - - [512, 512, 1, 256, 512, 512, 256, 256] - - [23, 3824.3] - - - [1024, 512, 1, 256, 1024, 1024, 256, 256] - - [21, 6310.04] - - - [1536, 512, 1, 256, 1536, 1536, 256, 256] - - [3, 9379.08] - - - [2048, 512, 1, 256, 2048, 2048, 256, 256] - - [7, 11175.5] - - - [2560, 512, 1, 256, 2560, 2560, 256, 256] - - [3, 12621.6] - - - [3072, 512, 1, 256, 3072, 3072, 256, 256] - - [7, 14480.3] - - - [512, 1024, 1, 256, 512, 512, 256, 256] - - [7, 7320.5] - - - [1024, 1024, 1, 256, 1024, 1024, 256, 256] - - [7, 11689.9] - - - [1536, 1024, 1, 256, 1536, 1536, 256, 256] - - [32, 11671.1] - - - [2048, 1024, 1, 256, 2048, 2048, 256, 256] - - [7, 16930.9] - - - [2560, 1024, 1, 256, 2560, 2560, 256, 256] - - [3, 18384.2] - - - [3072, 1024, 1, 256, 3072, 3072, 256, 256] - - [3, 20444.7] - - - [512, 1536, 1, 256, 512, 512, 256, 256] - - [3, 9420.77] - - - [1024, 1536, 1, 256, 1024, 1024, 256, 256] - - [7, 14561.4] - - - [1536, 1536, 1, 256, 1536, 1536, 256, 256] - - [7, 17782.7] - - - [2048, 1536, 1, 256, 2048, 2048, 256, 256] - - [7, 20398.1] - - - [2560, 1536, 1, 256, 2560, 2560, 256, 256] - - [7, 22130.6] - - - [3072, 1536, 1, 256, 3072, 3072, 256, 256] - - [7, 23282.6] - - - [512, 2048, 1, 256, 512, 512, 256, 256] - - [23, 11511.4] - - - [1024, 2048, 1, 256, 1024, 1024, 256, 256] - - [7, 17019.7] - - - [1536, 2048, 1, 256, 1536, 1536, 256, 256] - - [7, 20531.0] - - - [2048, 2048, 1, 256, 2048, 2048, 256, 256] - - [7, 22318.2] - - - [2560, 2048, 1, 256, 2560, 2560, 256, 256] - - [3, 24464.2] - - - [3072, 2048, 1, 256, 3072, 3072, 256, 256] - - [7, 26127.0] - - - [512, 2560, 1, 256, 512, 512, 256, 256] - - [23, 12787.8] - - - [1024, 2560, 1, 256, 1024, 1024, 256, 256] - - [23, 17516.4] - - - [1536, 2560, 1, 256, 1536, 1536, 256, 256] - - [7, 22099.0] - - - [2048, 2560, 1, 256, 2048, 2048, 256, 256] - - [3, 24493.2] - - - [2560, 2560, 1, 256, 2560, 2560, 256, 256] - - [23, 26039.0] - - - [3072, 2560, 1, 256, 3072, 3072, 256, 256] - - [7, 27133.3] - - - [512, 3072, 1, 256, 512, 512, 256, 256] - - [31, 11673.0] - - - [1024, 3072, 1, 256, 1024, 1024, 256, 256] - - [7, 20336.3] - - - [1536, 3072, 1, 256, 1536, 1536, 256, 256] - - [7, 23677.8] - - - [2048, 3072, 1, 256, 2048, 2048, 256, 256] - - [23, 25872.3] - - - [2560, 3072, 1, 256, 2560, 2560, 256, 256] - - [3, 27085.7] - - - [3072, 3072, 1, 256, 3072, 3072, 256, 256] - - [3, 28148.9] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [29, 6901.9] - - - [1024, 512, 1, 512, 1024, 1024, 512, 512] - - [22, 10023.5] - - - [1536, 512, 1, 512, 1536, 1536, 512, 512] - - [3, 14241.6] - - - [2048, 512, 1, 512, 2048, 2048, 512, 512] - - [7, 17210.7] - - - [2560, 512, 1, 512, 2560, 2560, 512, 512] - - [7, 18333.8] - - - [3072, 512, 1, 512, 3072, 3072, 512, 512] - - [18, 20924.1] - - - [512, 1024, 1, 512, 512, 512, 512, 512] - - [18, 11163.9] - - - [1024, 1024, 1, 512, 1024, 1024, 512, 512] - - [3, 17257.7] - - - [1536, 1024, 1, 512, 1536, 1536, 512, 512] - - [3, 21191.4] - - - [2048, 1024, 1, 512, 2048, 2048, 512, 512] - - [3, 23518.1] - - - [2560, 1024, 1, 512, 2560, 2560, 512, 512] - - [7, 24712.4] - - - [3072, 1024, 1, 512, 3072, 3072, 512, 512] - - [3, 26455.7] - - - [512, 1536, 1, 512, 512, 512, 512, 512] - - [7, 14173.7] - - - [1024, 1536, 1, 512, 1024, 1024, 512, 512] - - [3, 21272.9] - - - [1536, 1536, 1, 512, 1536, 1536, 512, 512] - - [7, 23413.7] - - - [2048, 1536, 1, 512, 2048, 2048, 512, 512] - - [7, 26629.0] - - - [2560, 1536, 1, 512, 2560, 2560, 512, 512] - - [3, 27606.6] - - - [3072, 1536, 1, 512, 3072, 3072, 512, 512] - - [3, 29315.8] - - - [512, 2048, 1, 512, 512, 512, 512, 512] - - [3, 17921.7] - - - [1024, 2048, 1, 512, 1024, 1024, 512, 512] - - [7, 23634.6] - - - [1536, 2048, 1, 512, 1536, 1536, 512, 512] - - [3, 26315.1] - - - [2048, 2048, 1, 512, 2048, 2048, 512, 512] - - [3, 28469.1] - - - [2560, 2048, 1, 512, 2560, 2560, 512, 512] - - [18, 30072.4] - - - [3072, 2048, 1, 512, 3072, 3072, 512, 512] - - [23, 31016.6] - - - [512, 2560, 1, 512, 512, 512, 512, 512] - - [3, 18699.8] - - - [1024, 2560, 1, 512, 1024, 1024, 512, 512] - - [3, 24838.3] - - - [1536, 2560, 1, 512, 1536, 1536, 512, 512] - - [3, 27568.4] - - - [2048, 2560, 1, 512, 2048, 2048, 512, 512] - - [3, 30484.8] - - - [2560, 2560, 1, 512, 2560, 2560, 512, 512] - - [7, 31541.9] - - - [3072, 2560, 1, 512, 3072, 3072, 512, 512] - - [23, 31976.4] - - - [512, 3072, 1, 512, 512, 512, 512, 512] - - [25, 20206.2] - - - [1024, 3072, 1, 512, 1024, 1024, 512, 512] - - [3, 26139.5] - - - [1536, 3072, 1, 512, 1536, 1536, 512, 512] - - [7, 29248.4] - - - [2048, 3072, 1, 512, 2048, 2048, 512, 512] - - [3, 30635.2] - - - [2560, 3072, 1, 512, 2560, 2560, 512, 512] - - [7, 31928.0] - - - [3072, 3072, 1, 512, 3072, 3072, 512, 512] - - [3, 32401.2] - - - [512, 512, 1, 1024, 512, 512, 1024, 1024] - - [16, 11191.8] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] - - [30, 15094.6] - - - [1536, 512, 1, 1024, 1536, 1536, 1024, 1024] - - [7, 20115.1] - - - [2048, 512, 1, 1024, 2048, 2048, 1024, 1024] - - [1, 18401.7] - - - [2560, 512, 1, 1024, 2560, 2560, 1024, 1024] - - [3, 24437.0] - - - [3072, 512, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 27036.4] - - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] - - [23, 18759.2] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 23930.3] - - - [1536, 1024, 1, 1024, 1536, 1536, 1024, 1024] - - [3, 27311.9] - - - [2048, 1024, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 29234.0] - - - [2560, 1024, 1, 1024, 2560, 2560, 1024, 1024] - - [18, 30887.0] - - - [3072, 1024, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 31271.5] - - - [512, 1536, 1, 1024, 512, 512, 1024, 1024] - - [3, 15196.0] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 27468.2] - - - [1536, 1536, 1, 1024, 1536, 1536, 1024, 1024] - - [9, 28538.7] - - - [2048, 1536, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 31260.8] - - - [2560, 1536, 1, 1024, 2560, 2560, 1024, 1024] - - [18, 31617.8] - - - [3072, 1536, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 33265.9] - - - [512, 2048, 1, 1024, 512, 512, 1024, 1024] - - [7, 24437.9] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] - - [23, 29042.0] - - - [1536, 2048, 1, 1024, 1536, 1536, 1024, 1024] - - [3, 31309.3] - - - [2048, 2048, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 32573.8] - - - [2560, 2048, 1, 1024, 2560, 2560, 1024, 1024] - - [7, 33352.6] - - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 1024] - - [18, 33978.9] - - - [512, 2560, 1, 1024, 512, 512, 1024, 1024] - - [3, 24450.6] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 30538.4] - - - [1536, 2560, 1, 1024, 1536, 1536, 1024, 1024] - - [18, 31676.3] - - - [2048, 2560, 1, 1024, 2048, 2048, 1024, 1024] - - [18, 33474.4] - - - [2560, 2560, 1, 1024, 2560, 2560, 1024, 1024] - - [25, 34230.7] - - - [3072, 2560, 1, 1024, 3072, 3072, 1024, 1024] - - [18, 34591.6] - - - [512, 3072, 1, 1024, 512, 512, 1024, 1024] - - [7, 27123.1] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] - - [25, 30137.2] - - - [1536, 3072, 1, 1024, 1536, 1536, 1024, 1024] - - [18, 33263.7] - - - [2048, 3072, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 33801.1] - - - [2560, 3072, 1, 1024, 2560, 2560, 1024, 1024] - - [3, 35197.0] - - - [3072, 3072, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 35739.8] - - - [512, 512, 1, 2048, 512, 512, 2048, 2048] - - [0, 16070.1] - - - [1024, 512, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 24839.9] - - - [1536, 512, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 24644.6] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 29724.8] - - - [2560, 512, 1, 2048, 2560, 2560, 2048, 2048] - - [7, 28576.3] - - - [3072, 512, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 31765.6] - - - [512, 1024, 1, 2048, 512, 512, 2048, 2048] - - [30, 19501.7] - - - [1024, 1024, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 29849.0] - - - [1536, 1024, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 31696.9] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] - - [7, 33441.7] - - - [2560, 1024, 1, 2048, 2560, 2560, 2048, 2048] - - [18, 33752.7] - - - [3072, 1024, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 34739.8] - - - [512, 1536, 1, 2048, 512, 512, 2048, 2048] - - [7, 25223.0] - - - [1024, 1536, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 32174.0] - - - [1536, 1536, 1, 2048, 1536, 1536, 2048, 2048] - - [23, 31646.4] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 2048] - - [3, 34694.8] - - - [2560, 1536, 1, 2048, 2560, 2560, 2048, 2048] - - [3, 34553.2] - - - [3072, 1536, 1, 2048, 3072, 3072, 2048, 2048] - - [23, 35759.2] - - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] - - [3, 29652.8] - - - [1024, 2048, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 33297.5] - - - [1536, 2048, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 34608.8] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 35139.7] - - - [2560, 2048, 1, 2048, 2560, 2560, 2048, 2048] - - [23, 35341.3] - - - [3072, 2048, 1, 2048, 3072, 3072, 2048, 2048] - - [23, 35956.2] - - - [512, 2560, 1, 2048, 512, 512, 2048, 2048] - - [3, 28258.1] - - - [1024, 2560, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 33681.7] - - - [1536, 2560, 1, 2048, 1536, 1536, 2048, 2048] - - [3, 34214.0] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2048] - - [3, 35566.9] - - - [2560, 2560, 1, 2048, 2560, 2560, 2048, 2048] - - [3, 34975.4] - - - [3072, 2560, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 36457.3] - - - [512, 3072, 1, 2048, 512, 512, 2048, 2048] - - [23, 32028.3] - - - [1024, 3072, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 35102.8] - - - [1536, 3072, 1, 2048, 1536, 1536, 2048, 2048] - - [23, 35622.4] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 36506.3] - - - [2560, 3072, 1, 2048, 2560, 2560, 2048, 2048] - - [23, 36578.4] - - - [3072, 3072, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 36227.1] - - - [512, 512, 1, 3072, 512, 512, 3072, 3072] - - [16, 19097.1] - - - [1024, 512, 1, 3072, 1024, 1024, 3072, 3072] - - [7, 22485.2] - - - [1536, 512, 1, 3072, 1536, 1536, 3072, 3072] - - [18, 27063.8] - - - [2048, 512, 1, 3072, 2048, 2048, 3072, 3072] - - [23, 31945.5] - - - [2560, 512, 1, 3072, 2560, 2560, 3072, 3072] - - [25, 30201.7] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 33670.1] - - - [512, 1024, 1, 3072, 512, 512, 3072, 3072] - - [23, 27764.6] - - - [1024, 1024, 1, 3072, 1024, 1024, 3072, 3072] - - [18, 31934.4] - - - [1536, 1024, 1, 3072, 1536, 1536, 3072, 3072] - - [18, 33879.6] - - - [2048, 1024, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 34652.8] - - - [2560, 1024, 1, 3072, 2560, 2560, 3072, 3072] - - [7, 35678.7] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36476.6] - - - [512, 1536, 1, 3072, 512, 512, 3072, 3072] - - [19, 26411.6] - - - [1024, 1536, 1, 3072, 1024, 1024, 3072, 3072] - - [18, 33941.6] - - - [1536, 1536, 1, 3072, 1536, 1536, 3072, 3072] - - [9, 33518.4] - - - [2048, 1536, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 35550.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 3072, 3072] - - [19, 35037.3] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36761.4] - - - [512, 2048, 1, 3072, 512, 512, 3072, 3072] - - [23, 32025.2] - - - [1024, 2048, 1, 3072, 1024, 1024, 3072, 3072] - - [23, 34942.8] - - - [1536, 2048, 1, 3072, 1536, 1536, 3072, 3072] - - [7, 36511.0] - - - [2048, 2048, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 36512.1] - - - [2560, 2048, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36582.2] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 3072] - - [3, 36507.9] - - - [512, 2560, 1, 3072, 512, 512, 3072, 3072] - - [9, 30497.2] - - - [1024, 2560, 1, 3072, 1024, 1024, 3072, 3072] - - [23, 35525.0] - - - [1536, 2560, 1, 3072, 1536, 1536, 3072, 3072] - - [9, 35295.7] - - - [2048, 2560, 1, 3072, 2048, 2048, 3072, 3072] - - [23, 36415.3] - - - [2560, 2560, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36079.3] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36791.5] - - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] - - [23, 33847.3] - - - [1024, 3072, 1, 3072, 1024, 1024, 3072, 3072] - - [3, 35738.6] - - - [1536, 3072, 1, 3072, 1536, 1536, 3072, 3072] - - [23, 36281.0] - - - [2048, 3072, 1, 3072, 2048, 2048, 3072, 3072] - - [3, 36700.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36778.3] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [3, 37457.0] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [17, 6.65624e-05] - - - [1, 1, 1, 64, 1, 1, 64, 64] - - [21, 0.00424868] - - - [1, 64, 1, 1, 1, 1, 1, 1] - - [13, 0.00424305] - - - [64, 1, 1, 1, 64, 64, 1, 1] - - [15, 0.00427565] - - - [64, 64, 1, 1, 64, 64, 1, 1] - - [2, 0.269944] - - - [64, 1, 1, 64, 64, 64, 64, 64] - - [10, 0.250505] - - - [1, 64, 1, 64, 1, 1, 64, 64] - - [28, 0.271195] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [12, 17.4611] - - - [64, 64, 1, 256, 64, 64, 256, 256] - - [11, 64.9052] - - - [64, 64, 1, 512, 64, 64, 512, 512] - - [14, 118.493] - - - [64, 64, 1, 1024, 64, 64, 1024, 1024] - - [11, 203.869] - - - [64, 64, 1, 2048, 64, 64, 2048, 2048] - - [11, 322.713] - - - [64, 64, 1, 4096, 64, 64, 4096, 4096] - - [27, 447.864] -- null -- null -- DeviceEfficiency -- GridBased