From 67dae2fdea2f938ffa39a6b09abafc07b010413a Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Fri, 7 Nov 2025 07:31:41 +0000 Subject: [PATCH 1/4] port kernel --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 77 +++++++++---------- 1 file changed, 35 insertions(+), 42 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index baa3f821d9b..b04fa34e208 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -26328,20 +26328,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x224x64_MI16fwZD_SxDnnVOKmLpTDrPrhupLHmkAIN1IXhJjWIM824= + BaseName: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1J2ed5Fhxf8DLJDiYbpPN48d6A8VIyjP0ZhnMLknaejw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 + ClusterLocalRead: 1 + CodeObjectVersion: '4' ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -26370,7 +26370,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA1_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 LSCB: 64 @@ -26381,13 +26381,13 @@ LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 3072 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126208 + LdsBytesNoAmax: 124160 LdsInitCVgprs: false - LdsNumBytes: 126208 + LdsNumBytes: 124160 LdsNumElementsAlignedA: 24832 - LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -26406,11 +26406,11 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26419,14 +26419,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [6, 7] + MIWaveTile: [6, 8] MIWaveTileA: 6 - MIWaveTileB: 7 + MIWaveTileB: 8 MIWaveTileMetadata: 0 MacroTile0: 192 - MacroTile1: 224 + MacroTile1: 256 MacroTileA: 192 - MacroTileB: 224 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26439,14 +26439,14 @@ MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 - MfmaInitCVgprs: false + MfmaInitCVgprs: true NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: true NonDTLTailLoopB: true NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 4 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 @@ -26454,14 +26454,14 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 168 - NumGlobalWriteVectorsPerThread: 168 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 NumLoadsA: 6 - NumLoadsB: 7 + NumLoadsB: 8 NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26472,27 +26472,22 @@ PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 112 - SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA1_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 @@ -26502,9 +26497,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 24 - ThreadTile1: 7 + ThreadTile1: 8 ThreadTileA: 24 - ThreadTileB: 7 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26513,24 +26508,24 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: true UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 8 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 + WorkGroupMapping: 16 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -26540,13 +26535,11 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 enableLDSTrA: true enableLDSTrB: false reorderGRInstForDTVA: false From 2b82f8e4a96bc52bc83c55fe2fa2bd360023e755 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Mon, 10 Nov 2025 01:35:56 +0000 Subject: [PATCH 2/4] add test --- .../Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index 8d0f8886001..c58e20becaf 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -110,6 +110,7 @@ BenchmarkProblems: BenchmarkFinalParameters: - ProblemSizes: - Range: [[255, 1, 257], [255, 1, 257], [1], [1, 37, 192]] + - Exact: [3264, 3072, 1, 1536] - BiasTypeArgs: ['b'] - # BenchmarkProblemSizeGroup - Standard - All problem InitialSolutionParameters: From 54489ac343b20024d141ea725c35fc4451d1ba26 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 19 Nov 2025 01:47:11 +0000 Subject: [PATCH 3/4] Revert "Revert "CMS implementation for 192x256x64TN (#2623)"" This reverts commit 974c29692cdb145e986ec20f7f2afa4190d1b219. --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 246 ++++++++++++++++++ .../Tensile/Components/CustomSchedule.py | 41 +++ .../gfx950/custom_mainloop_scheduling.yaml | 38 ++- 3 files changed, 324 insertions(+), 1 deletion(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index c434ab07d59..9c4bd13b8f6 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -306096,6 +306096,250 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1iMO6yjTG4yuZPDkcMXrcObwuUOXV_ipLalV5INSQqYE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123776 + LdsInitCVgprs: false + LdsNumBytes: 123776 + LdsNumElementsAlignedA: 24960 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24960 + LdsOffsetB_Blk: 90496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24960 + LdsOffsetMetadata_Blk: 90496 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumTotalPackedLoadsA: 6 + NumTotalPackedLoadsB: 8 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1308 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: true + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] - - - [16, 368640, 1, 224] - [0, 0.0] @@ -308735,6 +308979,8 @@ - [1305, 0.0] - - [1920, 2048, 1, 15964] - [1306, 0.0] + - - [3072, 4096, 1, 8192] + - [1308, 0.0] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index b77a61f5538..b3626904ad3 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -345,6 +345,47 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): SBarrier(comment=""), SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),] nglshift = nllshift = 14 # vmcnt shift for ngl and nll + elif isTN(kernel) and not useLDSTr and TLDS == 1: + #index and code pair + syncTable = [-1, SWaitCnt(dscnt=7, vlcnt=-1, vscnt=-1, comment="for LRB1-0"), + 6, SWaitCnt(dscnt=6+5, vlcnt=-1, vscnt=-1, comment="for LRB1-1"), + 8, SBarrier(comment="for GRA start"), + 11, SWaitCnt(dscnt=5+8, vlcnt=-1, vscnt=-1, comment="for LRB1-2"), + 17, SWaitCnt(dscnt=4+11, vlcnt=-1, vscnt=-1, comment="for LRB1-3"), + 23, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="for LRB1-4:6"), + 41, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="for LRB1-7"), + 46, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRA1"), + 48, SBarrier(comment="for LRA1 start"), + 78, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRB1"), + 78, SBarrier(comment="for LRB1 start"),] + optSchedule = { + 'SYNC' : [syncTable[::2]], + 'GRIncA': [[0,1,2,3,4,5,6,7,8]], + 'GRIncB': [[9,10,11,12,13,14,15,16,17]], + + 'LRA0' : [[0, 2, 3, 4, 5, 6]], + 'LRB0' : [[7, 9, 11, 13, 15, 17, 19, 21], + [8, 10, 12, 14, 16, 18, 20, 22]], + 'GRA' : [[8,8, 10,10, 12,12, 14,14, 26,26, 31,31], + [9,9, 11,11, 13,13, 15,15, 27,27, 32,32]], + + 'GRB' : [[46,46, 50,50, 54,54, 58,58, 62,62, 66,66, 70,70, 76,76], + [47,47, 51,51, 55,55, 59,59, 63,63, 67,67, 71,71, 77,77]], + 'LRA1' : [[48, 52, 56, 58, 60, 64], + [49, 53, 57, 59, 61, 65]], + # 0 1 2 3 4 5 6 7 + 'LRB1' : [[78, 80, 82, 84, 86, 90, 92, 94], + [79, 81, 83, 85, 87, 91, 93, 95]], + + 'LRSA' : [[22]], + 'LRSB' : [[23]], + + 'LWSA' : [[20]], + 'LWSB' : [[78]], + 'LCC' : [[95, 95]], + } + syncCode = syncTable[1::2] + nglshift = nllshift = 14 # vmcnt shift for ngl and nll else: return False, None diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index 98e2b98934f..d69fca86d1b 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -469,7 +469,43 @@ BenchmarkProblems: - Range: [[613], [612], [1], [1, 1, 64]] - Exact: [8192, 4096, 1, 2048] - BiasTypeArgs: ['b'] - + - # BenchmarkProblemSizeGroup - Standard - All problem + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16,32, 1, 1, 6, 8, 2,2 ] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - DepthU: [64] + - ScheduleIterAlg: [3] + - ExpandPointerSwap: [0] + - TransposeLDS: [1] #0,1 + - LocalReadVectorWidth: [8] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - DirectToLds: [1] + - StreamK: [3] + - LdsPadA: [8] #[-1] + - LdsPadB: [8] #[-1] + - StaggerU: [0] + - WorkGroupMapping: [16] + - WorkGroupMappingXCC: [2] + - 1LDSBuffer: [0] + - NonTemporalD: [4] + - SourceSwap: [1] + - UseSgprForGRO: [0] + - UseCustomMainLoopSchedule: [0, 1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [[192], [256], [1], [64, 64, 256]] + - Range: [[192], [256], [1], [1, 1, 64]] + - Range: [[192], [256], [1], [32, 64, 256]] + - Range: [[6144], [8192], [1], [64, 64, 256]] + - Exact: [3072, 4096, 1, 8192] + - BiasTypeArgs: ['b'] ######################################## # HHS TN - standard ######################################## From b5c73e98c94c1a3391c4a7c658cb4aba7721c888 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Wed, 19 Nov 2025 08:50:48 +0000 Subject: [PATCH 4/4] Revert "Revert "Revert "CMS implementation for 192x256x64TN (#2623)""" This reverts commit 54489ac343b20024d141ea725c35fc4451d1ba26. --- ...k_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml | 246 ------------------ .../Tensile/Components/CustomSchedule.py | 41 --- .../gfx950/custom_mainloop_scheduling.yaml | 38 +-- 3 files changed, 1 insertion(+), 324 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index 9c4bd13b8f6..c434ab07d59 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -306096,250 +306096,6 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1iMO6yjTG4yuZPDkcMXrcObwuUOXV_ipLalV5INSQqYE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123776 - LdsInitCVgprs: false - LdsNumBytes: 123776 - LdsNumElementsAlignedA: 24960 - LdsNumElementsAlignedB: 33280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24960 - LdsOffsetB_Blk: 90496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24960 - LdsOffsetMetadata_Blk: 90496 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 8] - MIWaveTileA: 6 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: 6 - NumTotalPackedLoadsB: 8 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1308 - SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 8 - ThreadTileA: 24 - ThreadTileB: 8 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: true - UseDirect32XEmulation: false - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: false - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 8 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - [2, 3, 0, 1] - - - [16, 368640, 1, 224] - [0, 0.0] @@ -308979,8 +308735,6 @@ - [1305, 0.0] - - [1920, 2048, 1, 15964] - [1306, 0.0] - - - [3072, 4096, 1, 8192] - - [1308, 0.0] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py index b3626904ad3..b77a61f5538 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/CustomSchedule.py @@ -345,47 +345,6 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS): SBarrier(comment=""), SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),] nglshift = nllshift = 14 # vmcnt shift for ngl and nll - elif isTN(kernel) and not useLDSTr and TLDS == 1: - #index and code pair - syncTable = [-1, SWaitCnt(dscnt=7, vlcnt=-1, vscnt=-1, comment="for LRB1-0"), - 6, SWaitCnt(dscnt=6+5, vlcnt=-1, vscnt=-1, comment="for LRB1-1"), - 8, SBarrier(comment="for GRA start"), - 11, SWaitCnt(dscnt=5+8, vlcnt=-1, vscnt=-1, comment="for LRB1-2"), - 17, SWaitCnt(dscnt=4+11, vlcnt=-1, vscnt=-1, comment="for LRB1-3"), - 23, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="for LRB1-4:6"), - 41, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="for LRB1-7"), - 46, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRA1"), - 48, SBarrier(comment="for LRA1 start"), - 78, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRB1"), - 78, SBarrier(comment="for LRB1 start"),] - optSchedule = { - 'SYNC' : [syncTable[::2]], - 'GRIncA': [[0,1,2,3,4,5,6,7,8]], - 'GRIncB': [[9,10,11,12,13,14,15,16,17]], - - 'LRA0' : [[0, 2, 3, 4, 5, 6]], - 'LRB0' : [[7, 9, 11, 13, 15, 17, 19, 21], - [8, 10, 12, 14, 16, 18, 20, 22]], - 'GRA' : [[8,8, 10,10, 12,12, 14,14, 26,26, 31,31], - [9,9, 11,11, 13,13, 15,15, 27,27, 32,32]], - - 'GRB' : [[46,46, 50,50, 54,54, 58,58, 62,62, 66,66, 70,70, 76,76], - [47,47, 51,51, 55,55, 59,59, 63,63, 67,67, 71,71, 77,77]], - 'LRA1' : [[48, 52, 56, 58, 60, 64], - [49, 53, 57, 59, 61, 65]], - # 0 1 2 3 4 5 6 7 - 'LRB1' : [[78, 80, 82, 84, 86, 90, 92, 94], - [79, 81, 83, 85, 87, 91, 93, 95]], - - 'LRSA' : [[22]], - 'LRSB' : [[23]], - - 'LWSA' : [[20]], - 'LWSB' : [[78]], - 'LCC' : [[95, 95]], - } - syncCode = syncTable[1::2] - nglshift = nllshift = 14 # vmcnt shift for ngl and nll else: return False, None diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml index 74b67b3cf2e..cd1f965d85b 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/custom_mainloop_scheduling.yaml @@ -470,43 +470,7 @@ BenchmarkProblems: - Range: [[613], [612], [1], [1, 1, 64]] - Exact: [8192, 4096, 1, 2048] - BiasTypeArgs: ['b'] - - # BenchmarkProblemSizeGroup - Standard - All problem - InitialSolutionParameters: - BenchmarkCommonParameters: - - KernelLanguage: ["Assembly"] - ForkParameters: - - MatrixInstruction: - - [16, 16,32, 1, 1, 6, 8, 2,2 ] - - PrefetchGlobalRead: [2] - - PrefetchLocalRead: [1] - - DepthU: [64] - - ScheduleIterAlg: [3] - - ExpandPointerSwap: [0] - - TransposeLDS: [1] #0,1 - - LocalReadVectorWidth: [8] - - GlobalReadVectorWidthA: [8] - - GlobalReadVectorWidthB: [8] - - DirectToLds: [1] - - StreamK: [3] - - LdsPadA: [8] #[-1] - - LdsPadB: [8] #[-1] - - StaggerU: [0] - - WorkGroupMapping: [16] - - WorkGroupMappingXCC: [2] - - 1LDSBuffer: [0] - - NonTemporalD: [4] - - SourceSwap: [1] - - UseSgprForGRO: [0] - - UseCustomMainLoopSchedule: [0, 1] - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [[192], [256], [1], [64, 64, 256]] - - Range: [[192], [256], [1], [1, 1, 64]] - - Range: [[192], [256], [1], [32, 64, 256]] - - Range: [[6144], [8192], [1], [64, 64, 256]] - - Exact: [3072, 4096, 1, 8192] - - BiasTypeArgs: ['b'] + ######################################## # HHS TN - standard ########################################