diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml index 24e7246a10e..af01afcf94f 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_nta4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml @@ -160,7 +160,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 
256 LSCB: 256 @@ -316,7 +316,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -484,7 +484,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -640,7 +640,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -808,7 +808,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -964,7 +964,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 
1 SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -1132,7 +1132,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -1288,7 +1288,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -1456,7 +1456,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -1612,7 +1612,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -1780,7 +1780,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -1936,7 +1936,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -2104,7 +2104,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -2260,7 +2260,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 
1 SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -2428,7 +2428,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + 
KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -2584,7 +2584,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false 
SpaceFillingAlgo: [] StaggerU: 0 @@ -2752,7 +2752,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -2908,7 +2908,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 8 - SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -3076,7 +3076,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -3232,7 +3232,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 
0 @@ -3400,7 +3400,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -3556,7 +3556,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -3724,7 +3724,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -3880,7 +3880,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -4048,7 +4048,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -4204,7 +4204,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -4372,7 +4372,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -4528,7 +4528,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -4696,7 +4696,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -4852,7 +4852,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -5020,7 +5020,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -5176,7 +5176,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -5344,7 +5344,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -5500,7 +5500,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -5668,7 +5668,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -5824,7 +5824,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -5992,7 +5992,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -6148,7 +6148,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -6316,7 +6316,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -6472,7 +6472,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -6640,7 +6640,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -6796,7 +6796,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -6964,7 +6964,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -7120,7 +7120,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -7288,7 +7288,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -7444,7 +7444,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -7612,7 +7612,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -7768,7 +7768,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -7936,7 +7936,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -8092,7 +8092,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -8260,7 +8260,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -8416,7 +8416,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -8584,7 +8584,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -8740,7 +8740,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -8908,7 +8908,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -9064,7 +9064,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -9232,7 +9232,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -9388,7 +9388,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -9556,7 +9556,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -9712,7 +9712,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -9880,7 +9880,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -10036,7 +10036,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -10204,7 +10204,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -10360,7 +10360,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -10528,7 +10528,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -10684,7 +10684,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -10852,7 +10852,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -11008,7 +11008,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 33 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -11108,9 +11108,5841 @@ tailLoopOptB: false tailLoopOptMXSA: false tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 
+ VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12288 + 
LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 
+ NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + 
AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 
1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 12288 + 
LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr 
+ EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + 
LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 
1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 + 
LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 
1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 16384 + 
LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + 
NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + 
AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 
+ VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 32768 + 
LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + 
NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + 
AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 
+ VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 32768 + 
LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 
+ MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 
1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + 
LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + 
NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + 
AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + 
VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 + 
LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 
+ NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false - null - null - null - null - DeviceEfficiency -- Prediction +- Prediction \ No newline at end of file diff --git 
a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml index c8a65d5b849..891dbe7609c 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/Origami_ntb4/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml @@ -160,7 +160,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 
256 @@ -316,7 +316,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -484,7 +484,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -640,7 +640,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -808,7 +808,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -964,7 +964,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 
1 SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -1132,7 +1132,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -1288,7 +1288,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -1456,7 +1456,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -1612,7 +1612,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -1780,7 +1780,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -1936,7 +1936,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -2104,7 +2104,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -2260,7 +2260,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 
1 SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -2428,7 +2428,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + 
KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -2584,7 +2584,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false 
SpaceFillingAlgo: [] StaggerU: 0 @@ -2752,7 +2752,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -2908,7 +2908,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 8 - SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -3076,7 +3076,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -3232,7 +3232,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 
0 @@ -3400,7 +3400,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -3556,7 +3556,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + 
SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -3724,7 +3724,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -3880,7 +3880,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -4048,7 +4048,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -4204,7 +4204,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -4372,7 +4372,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -4528,7 +4528,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -4696,7 +4696,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -4852,7 +4852,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -5020,7 +5020,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -5176,7 +5176,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -5344,7 +5344,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -5500,7 +5500,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -5668,7 +5668,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -5824,7 +5824,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -5992,7 +5992,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -6148,7 +6148,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -6316,7 +6316,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -6472,7 +6472,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -6640,7 +6640,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -6796,7 +6796,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_16_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -6964,7 +6964,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -7120,7 +7120,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT16_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -7288,7 +7288,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -7444,7 +7444,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -7612,7 +7612,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -7768,7 +7768,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -7936,7 +7936,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -8092,7 +8092,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -8260,7 +8260,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -8416,7 +8416,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -8584,7 +8584,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -8740,7 +8740,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT320x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -8908,7 +8908,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -9064,7 +9064,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x320x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -9232,7 +9232,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -9388,7 +9388,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT64x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -9556,7 +9556,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -9712,7 +9712,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x64x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -9880,7 +9880,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -10036,7 +10036,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x448x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_14_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -10204,7 +10204,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -10360,7 +10360,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT448x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT14_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -10528,7 +10528,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -10684,7 +10684,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT192x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_12_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -10852,7 +10852,7 @@ KRingShift: false Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 256 LSCB: 256 @@ -11008,7 +11008,7 @@ ScheduleIterAlg: 3 
ScheduleLocalWrite: 1 SolutionIndex: 33 - SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x192x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT12_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 SourceSwap: false SpaceFillingAlgo: [] StaggerU: 0 @@ -11108,9 +11108,5841 @@ tailLoopOptB: false tailLoopOptMXSA: false tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 
+ VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12288 + 
LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 
+ NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + 
AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 
1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 12288 + 
LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr 
+ EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + 
LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 
1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 + 
LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 
1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 16384 + 
LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + 
NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + 
AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 
+ VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 32768 + 
LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + 
NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + 
AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 
+ VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 32768 + 
LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 
+ MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 
1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + 
LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + 
NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + 
AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + 
VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 + 
LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 
+ NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false - null - null - null - null - DeviceEfficiency -- Prediction +- Prediction \ No newline at end of file diff --git 
a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml index 7e1b9bc847c..662290b4498 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/gfx950/Origami/gfx950_Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs.yaml @@ -11108,6 +11108,5838 @@ tailLoopOptB: false tailLoopOptMXSA: false tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT12pSBqfd5dzj78eX1t5EQJzYZ5aHFVtQ9uJHS361sw3qY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false 
+ ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17664 + LdsOffsetB_Blk: 50432 + LdsOffsetBias: 0 + 
LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 17664 + LdsOffsetMetadata_Blk: 50432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileMXSA: 128 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 
1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + 
ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + 
AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT12lYJO4P9-MC57Obi2kvcZAOwe_1ysdtWgX7-VIBNgQpw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 1024 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18176 + LdsOffsetB_Blk: 50944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 17408 + LdsOffsetMXSB_Blk: 50176 + LdsOffsetMetadata: 18176 + LdsOffsetMetadata_Blk: 50944 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileMXSA: 128 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT128x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 
+ VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT25ccxCqQ2dvGclrJYn0QqExpGJSq56GiHUJPL0jzB0qVw= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 4096 + 
LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35072 + LdsOffsetB_Blk: 100608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35072 + LdsOffsetMetadata_Blk: 100608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileMXSA: 256 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + ThreadTileMXSA: 16 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT25LD47Rd1FCRkyTzzAeZkhMOPqncN77I3VtqjftUOo3W0= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 35584 + LdsOffsetB_Blk: 101120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 35584 + LdsOffsetMetadata_Blk: 101120 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 6] + MIWaveTileA: 4 + MIWaveTileB: 6 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 96 + MacroTileA: 256 + MacroTileB: 96 + MacroTileMXSA: 256 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 6 + ThreadTileA: 16 + ThreadTileB: 6 + ThreadTileMXSA: 16 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 
1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT252kruXsNK48kysY3byfZ8nkfbVQgYyySEfltsnGOOr7I= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 5 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 160 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 20480 + 
LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 36096 + LdsOffsetB_Blk: 101632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 98304 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 100352 + LdsOffsetMetadata: 36096 + LdsOffsetMetadata_Blk: 101632 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 10] + MIWaveTileA: 4 + MIWaveTileB: 10 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 10 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MacroTileMXSA: 256 + MacroTileMXSB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x160x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_10_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 10 + ThreadTileA: 16 + ThreadTileB: 10 + ThreadTileMXSA: 16 + ThreadTileMXSB: 10 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT25K1mXD-Z64cKGl1n64cEhRIvEdarIIYY6qzM6iXFNutU= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 4 + GlobalReadVectorWidthMXSB: 9 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 128 + LSPMXSB: 288 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMXSA: 2048 + LdsNumElementsAlignedMXSB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 37120 + LdsOffsetB_Blk: 111104 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 32768 + LdsOffsetMXSA_Blk: 106752 + LdsOffsetMXSB: 34816 + LdsOffsetMXSB_Blk: 108800 + LdsOffsetMetadata: 37120 + LdsOffsetMetadata_Blk: 111104 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 18] + MIWaveTileA: 4 + MIWaveTileB: 18 + MIWaveTileMXSA: 4 + MIWaveTileMXSB: 18 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MacroTileMXSA: 256 + MacroTileMXSB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 2 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularMXSA: 2 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT256x288x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT4_18_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 18 + ThreadTileA: 16 + ThreadTileB: 18 + ThreadTileMXSA: 16 + ThreadTileMXSB: 18 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + 
VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT382wXw8HnHr0540PBKyCMkiG1GnHC8Qq6QTA2UY90u0Us= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr 
+ EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 4096 + 
LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52480 + LdsOffsetB_Blk: 118016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52480 + LdsOffsetMetadata_Blk: 118016 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 32 + MacroTileA: 384 + MacroTileB: 32 + MacroTileMXSA: 384 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT38iWtaR8v4gfASTnshP-7Zt37fJ-3bIKngSBlzEMd0qgE= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 12 + GlobalReadVectorWidthMXSB: 3 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 384 + LSPMXSB: 96 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 3 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMXSA: 3072 + LdsNumElementsAlignedMXSB: 768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 52992 + LdsOffsetB_Blk: 118528 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 49152 + LdsOffsetMXSA_Blk: 114688 + LdsOffsetMXSB: 52224 + LdsOffsetMXSB_Blk: 117760 + LdsOffsetMetadata: 52992 + LdsOffsetMetadata_Blk: 118528 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MacroTileMXSA: 384 + MacroTileMXSB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 12 + NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT384x96x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 
1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT51My4B9dUWEKuVinUMAqyTn-9nVKbOgw4LZNuajpvbgno= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 16 + GlobalReadVectorWidthMXSB: 1 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 512 + LSPMXSB: 32 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 8 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 4096 + 
LdsNumElementsAlignedMXSA: 4096 + LdsNumElementsAlignedMXSB: 256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 69888 + LdsOffsetB_Blk: 143872 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 139520 + LdsOffsetMXSB: 69632 + LdsOffsetMXSB_Blk: 143616 + LdsOffsetMetadata: 69888 + LdsOffsetMetadata_Blk: 143872 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [8, 2] + MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMXSA: 8 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 512 + MacroTile1: 32 + MacroTileA: 512 + MacroTileB: 32 + MacroTileMXSA: 512 + MacroTileMXSB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT512x32x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT8_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SubGroupMXSA: 16 + SubGroupMXSB: 16 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + ThreadTileMXSA: 32 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32dv3nH_ug3zWfzXFJEJHJUvkpjXQjAQVml90QMZvwYvs= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 54528 + LdsInitCVgprs: false + LdsNumBytes: 54528 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5376 + LdsOffsetB_Blk: 38144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 36864 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 37120 + LdsOffsetMetadata: 5376 + LdsOffsetMetadata_Blk: 38144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileMXSA: 32 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + ThreadTileMXSA: 8 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + 
VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96VdJ139l5ExTu5vrbSyr9YIDnqGsFY6rZMfGBgYGwyok= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 63232 + LdsInitCVgprs: false + LdsNumBytes: 63232 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16384 + 
LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 1024 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 14080 + LdsOffsetB_Blk: 46848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 45056 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 45824 + LdsOffsetMetadata: 14080 + LdsOffsetMetadata_Blk: 46848 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileMXSA: 96 + MacroTileMXSB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 
+ NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x128x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_2_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 2 + ThreadTileA: 24 + ThreadTileB: 2 + ThreadTileMXSA: 24 + ThreadTileMXSB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + 
AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32pDTSO9Ei8iprYUZVwZ8MVK8XTu8S8ugrlkQJj7C1FJQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 104704 + LdsInitCVgprs: false + LdsNumBytes: 104704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 6400 + LdsOffsetB_Blk: 71936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 6400 + LdsOffsetMetadata_Blk: 71936 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileMXSA: 32 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + ThreadTileMXSA: 8 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + 
VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96Cj2Z80w1PkWDv8Q4NtOmErGoRYnlYoT1xjNw40Qn6y8= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 113408 + LdsInitCVgprs: false + LdsNumBytes: 113408 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 32768 + 
LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 15104 + LdsOffsetB_Blk: 80640 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 15104 + LdsOffsetMetadata_Blk: 80640 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 256 + MacroTileA: 96 + MacroTileB: 256 + MacroTileMXSA: 96 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 
+ NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + ThreadTileMXSA: 24 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + 
AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT16rkdIx0xJ9AdmxSLTSuPL_HwvXURN3d9JBsT0i2N2r4A= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 5 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 160 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 2 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 1280 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23808 + LdsOffsetB_Blk: 89344 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 20480 + LdsOffsetMXSA_Blk: 86016 + LdsOffsetMXSB: 21760 + LdsOffsetMXSB_Blk: 87296 + LdsOffsetMetadata: 23808 + LdsOffsetMetadata_Blk: 89344 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 4] + MIWaveTileA: 10 + MIWaveTileB: 4 + MIWaveTileMXSA: 10 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 + MacroTileMXSA: 160 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLdsBlk: 2 + NumLoadsA: 5 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT160x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT10_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 4 + ThreadTileA: 40 + ThreadTileB: 4 + ThreadTileMXSA: 40 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 
1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT28HX9AX04H5gx2Hnt9CFkwE4Io5BNr8Him7sxPkQ6FxSI= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 9 + GlobalReadVectorWidthMXSB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 288 + LSPMXSB: 128 + LVCA: 8 + LVCB: 8 + LVCMXSA: 1 + LVCMXSB: 2 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 32768 + 
LdsNumElementsAlignedMXSA: 2304 + LdsNumElementsAlignedMXSB: 2048 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 41216 + LdsOffsetB_Blk: 115200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 36864 + LdsOffsetMXSA_Blk: 110848 + LdsOffsetMXSB: 39168 + LdsOffsetMXSB_Blk: 113152 + LdsOffsetMetadata: 41216 + LdsOffsetMetadata_Blk: 115200 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [18, 4] + MIWaveTileA: 18 + MIWaveTileB: 4 + MIWaveTileMXSA: 18 + MIWaveTileMXSB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 288 + MacroTile1: 256 + MacroTileA: 288 + MacroTileB: 256 + MacroTileMXSA: 288 + MacroTileMXSB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + 
NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 288 + NumLdsBlk: 2 + NumLoadsA: 9 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 2 + NumLoadsPerpendicularA: 9 + NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 2 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT288x256x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT18_4_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + 
SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 72 + ThreadTile1: 4 + ThreadTileA: 72 + ThreadTileB: 4 + ThreadTileMXSA: 72 + ThreadTileMXSB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + 
AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT323cjBEjqTj-Mu9ZF1RsqJ73WqkJOeA3kpk9e2rgOyJEQ= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 122112 + LdsInitCVgprs: false + LdsNumBytes: 122112 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7424 + LdsOffsetB_Blk: 72960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 69632 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 69888 + LdsOffsetMetadata: 7424 + LdsOffsetMetadata_Blk: 72960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 384 + MacroTileA: 32 + MacroTileB: 384 + MacroTileMXSA: 32 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + ThreadTileMXSA: 8 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + 
VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32k1L8EqrYflL4kvotLP49gssYPXLs5aDdnaskyCeVRw4= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + 
EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 1 + GlobalReadVectorWidthMXSB: 16 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 32 + LSPMXSB: 512 + LVCA: 8 + LVCB: 8 + LVCMXSA: 8 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 147968 + LdsInitCVgprs: false + LdsNumBytes: 147968 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 65536 + 
LdsNumElementsAlignedMXSA: 256 + LdsNumElementsAlignedMXSB: 4096 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 73984 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 4096 + LdsOffsetMXSA_Blk: 78080 + LdsOffsetMXSB: 4352 + LdsOffsetMXSB_Blk: 78336 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMXSA: 2 + MIWaveTileMXSB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 512 + MacroTileA: 32 + MacroTileB: 512 + MacroTileMXSA: 32 + MacroTileMXSB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + 
NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT32x512x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT2_8_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + 
TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + ThreadTileMXSA: 8 + ThreadTileMXSB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 + VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 0 + 
AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT9680-vsketBKOwATFxdutsK5i_NcnvLOonACpwOS25BUY= + BufferLoad: true + BufferStore: 1 + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 1 + DirectToLdsA: true + DirectToLdsB: true + DirectToLdsMXSA: true + DirectToLdsMXSB: true + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 32 + GlobalReadVectorWidthB: 32 + GlobalReadVectorWidthMXSA: 3 + GlobalReadVectorWidthMXSB: 12 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSCMXSA: 8 + LSCMXSB: 8 + LSPA: 32 + LSPB: 32 + LSPMXSA: 96 + LSPMXSB: 384 + LVCA: 8 + LVCB: 8 + LVCMXSA: 3 + LVCMXSB: 1 + LVPA: 1 + LVPB: 1 + LVPMXSA: 32 + LVPMXSB: 32 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130816 + LdsInitCVgprs: false + LdsNumBytes: 130816 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMXSA: 768 + LdsNumElementsAlignedMXSB: 3072 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16128 + LdsOffsetB_Blk: 81664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 12288 + LdsOffsetMXSA_Blk: 77824 + LdsOffsetMXSB: 13056 + LdsOffsetMXSB_Blk: 78592 + LdsOffsetMetadata: 16128 + LdsOffsetMetadata_Blk: 81664 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 32 + LocalReadVectorWidthB: 32 + LocalReadVectorWidthMXSA: 1 + LocalReadVectorWidthMXSB: 1 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LocalWriteUseSgprMXSA: true + LocalWriteUseSgprMXSB: true + LoopIters: 2 + LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 128, 1, 1, 1] + MIInputPerThread: 32 + 
MIInputPerThreadA: 32 + MIInputPerThreadB: 32 + MIInputPerThreadMXSA: 1 + MIInputPerThreadMXSB: 1 + MIInputPerThreadMetadata: 32 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 6] + MIWaveTileA: 6 + MIWaveTileB: 6 + MIWaveTileMXSA: 6 + MIWaveTileMXSB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MacroTileMXSA: 96 + MacroTileMXSB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 128 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 128, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 3 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonDTLTailLoopMXSA: true + NonDTLTailLoopMXSB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLdsBlk: 2 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsCoalescedMXSA: 1 + NumLoadsCoalescedMXSB: 1 + NumLoadsMXSA: 1 + NumLoadsMXSB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularMXSA: 1 + NumLoadsPerpendicularMXSB: 1 + NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumTotalPackedLoadsMXSA: -1 + NumTotalPackedLoadsMXSB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: 
Cijk_Alik_Bljk_F4BS_MXA32_MXB32_BH_UserArgs_MT96x384x256_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM32_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA32_GRVWB32_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV0_MIWT6_6_MO40_MGRIPM3_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SGROB0_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_SGRO0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL0_USLMX0_UIOFGRO0_UPLRP0_USFGROn1_USI1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC2_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SubGroupMXSA: 4 + SubGroupMXSB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 6 + ThreadTileA: 24 + ThreadTileB: 6 + ThreadTileMXSA: 24 + ThreadTileMXSB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMXSA: true + UnrollMajorLDSMXSB: true + Use64bShadowLimit: false + Use64bShadowLimitMX: false + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + UseSubtileImpl: true + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + VectorWidthMXSA: 1 
+ VectorWidthMXSB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMXSA: 0 + WaveSeparateGlobalReadMXSB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 2 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMXSA: 8 + _DepthUMXSB: 8 + _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + tailLoopOptMXSA: false + tailLoopOptMXSB: false - null - null - null diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedKernel.py b/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedKernel.py index 24f5228f176..8e5f5ea5c95 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedKernel.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedKernel.py @@ -8,6 +8,20 @@ from ..Common import printWarning, roundUp, print2, DebugConfig, DataDirection, \ INDEX_CHARS, IsaVersion + +from rocisa.code import Module, TextBlock, StructuredModule, KernelBody, Label +from rocisa.label import LabelManager + +from rocisa.container import MUBUFModifiers, vgpr, sgpr, accvgpr, mgpr +from rocisa.enum import InstType, SelectBit, CacheScope +from rocisa.instruction import MFMAInstruction + +import math +from copy import deepcopy +from dataclasses import dataclass, field +from typing import Dict, List, NamedTuple, Optional, Tuple, Type +from contextlib import contextmanager +from collections import deque from rocisa import rocIsa, 
countInstruction, countGlobalRead, \ countLocalRead, countLocalWrite, countDSStoreB256, getMFMAs from rocisa.asmpass import rocIsaPass, rocIsaPassOption @@ -1245,25 +1259,36 @@ def globalReadScalePtrUpdates(tc, writer, kernel): def emitSingleBufferLoad(tileInfo, kernel, sId0, sId1): """Emit buffer_load instructions for a single subtile (sId0, sId1). + When loadRatioGR > 1, multiple local subtiles share the same global read. + Only the first subtile in each group emits the load; others return empty. + Args: tileInfo: TileInfo for the tensor component sId0: Subtile row index sId1: Subtile column index (K-dimension) """ module = Module() - tc = tileInfo.tc + subtileInfo = tileInfo.localSubtiles[tileInfo.getLocalSubtileLinearId(sId0, sId1)] + grBaseId = subtileInfo.globalReadMap[0] + + # When loadRatioGR > 1, multiple subtiles share one global read. + # Only emit the load for the first subtile of each group. + if tileInfo.loadRatioGR > 1: + linearId = tileInfo.getLocalSubtileLinearId(sId0, sId1) + firstInGroup = int(grBaseId * tileInfo.loadRatioGR) + if linearId != firstInGroup: + return module + + tc = tileInfo.tc isGlc = bool(kernel["NonTemporal%s"%tc] & 0x1) isSlc = bool(kernel["NonTemporal%s"%tc] & 0x2) isNT = bool(kernel["NonTemporal%s"%tc] & 0x4) - subtileInfo = tileInfo.localSubtiles[tileInfo.getLocalSubtileLinearId(sId0, sId1)] regList = tileInfo.localSubtilesRegister[subtileInfo.regListId] offsetK = sId1 * int(tileInfo.mmaTileShape[1] * tileInfo.subtileShape[1] * tileInfo.bpe) - # TODO: grBaseId is probably not needed.. 
- grBaseId = subtileInfo.globalReadMap[0] - + subtileOffset = math.ceil(tileInfo.loadRatioGR*tileInfo.subtileSize) WriteBaseAddr = "LocalWriteBaseAddr%s"%tc # Emit number of buffer loads equal to number of loads needed to load a subtile @@ -1297,17 +1322,10 @@ def globalReadDoSubtile(tc, writer, kernel): tileInfo = writer.states.a.tileInfo if tc == 'A' else writer.states.b.tileInfo - grTracker = set() for j in range(tileInfo.localSubtileGrid[1]): for i in range(tileInfo.localSubtileGrid[0]): - grIds = tileInfo.localSubtiles[tileInfo.getLocalSubtileLinearId(i ,j)].globalReadMap - if not set(grIds).issubset(grTracker): - for grId in grIds: - grTracker.add(grId) - module.addComment0("Emit load for %s subtile: [%u, %u]"%(tc, i, j)) - module.add(emitSubtileBufferLoad(tc, writer, kernel, [i, j])) - else: - module.addComment0("Emit load for %s subtile: [%u, %u] - already covered"%(tc, i, j)) + module.addComment0("Emit load for %s subtile: [%u, %u]"%(tc, i, j)) + module.add(emitSubtileBufferLoad(tc, writer, kernel, [i, j])) return module @@ -1775,15 +1793,16 @@ def mainLoop(writer, kernel): # new path for PGR=2 pipelining with SubtileBasedScheduler if pgr == 2: - from Tensile.Components.SubtileBasedScheduler import SubtileBasedScheduler, SchedulerConfig, PrefetchMode, VGPRTileReUseStrategy + from Tensile.Components.SubtileBasedScheduler import SubtileBasedScheduler, SchedulerConfig, PrefetchMode tiA = writer.states.a.tileInfo tiB = writer.states.b.tileInfo scaleTiA = writer.states.mxsa.tileInfo if kernel["ProblemType"].get("MXBlockA", 0) else None scaleTiB = writer.states.mxsb.tileInfo if kernel["ProblemType"].get("MXBlockB", 0) else None + # For 320x256, Use 5x1 partition grid. + # cfg = SchedulerConfig(tiA.localSubtileGrid[0]//5, tiB.localSubtileGrid[0], # Use a single partition for now.
TODO cfg = SchedulerConfig(tiA.localSubtileGrid[0], tiB.localSubtileGrid[0], - #cfg = SchedulerConfig(tiA.localSubtileGrid[0]//2, tiB.localSubtileGrid[0]//2, - PrefetchMode.HALF_PREFETCH, VGPRTileReUseStrategy.ACROSS_SUBGROUP) + PrefetchMode.HALF_PREFETCH) scheduler = SubtileBasedScheduler(tiA, tiB, cfg, scaleTileInfoA=scaleTiA, scaleTileInfoB=scaleTiB) # scheduler.printSchedule() @@ -1804,10 +1823,12 @@ def mainLoop(writer, kernel): module.addComment0("MAINLOOP") numPartitions = len(scheduler.partitions) - # With scale double buffering, the scale set rotates per partition inside _emitLoop. - # After one iteration (N partitions), the set flips if N is odd → need 2x unrolling. - # If N is even, the set returns to starting position → no unrolling needed. - needsScaleUnroll = scheduler.hasScale and (numPartitions % 2 == 1) + # With scale double buffering, the scale set rotates inside _emitLoop: + # once per partition end + once per subtileK boundary = numSubtileK flips per partition. + # After one iteration (N partitions), total flips = N * numSubtileK. + # If odd → need 2x unrolling. If even → sets return to start, no unrolling needed. + scaleFlipsPerIter = numPartitions * scheduler.numSubtileK + needsScaleUnroll = scheduler.hasScale and (scaleFlipsPerIter % 2 == 1) if needsScaleUnroll: # 2x unrolled mainloop for odd partition count. @@ -1849,7 +1870,7 @@ def mainLoop(writer, kernel): module.add(Label("SkipToNGLL", "")) if scheduler.hasScale: endLabel = Label("SkipToEnd", "") - nllSet = 1 if numPartitions % 2 == 1 else 0 + nllSet = 1 if scaleFlipsPerIter % 2 == 1 else 0 # Even path (or only path when no unrolling): mainloop ended at scaleSet=0. 
module.add(scheduler._emitLoop(writer, kernel, "NGLL", scheduler.ngllSteps, @@ -1866,7 +1887,7 @@ def mainLoop(writer, kernel): module.add(scheduler._emitLoop(writer, kernel, "NGLL_odd", scheduler.ngllSteps, scaleSet=1)) module.addComment0("NLL (odd)") - nllSetOdd = 0 if numPartitions % 2 == 1 else 1 + nllSetOdd = 0 if scaleFlipsPerIter % 2 == 1 else 1 module.add(scheduler._emitLoop(writer, kernel, "NLL_odd", scheduler.nllSteps, scaleSet=nllSetOdd)) # NLLEarly: reached when counterL<=1 (preloop skip, no NGLL). diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedScheduler.py b/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedScheduler.py index 200fa63f267..dd7e9e1d7d5 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedScheduler.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/SubtileBasedScheduler.py @@ -1,4 +1,35 @@ -import dataclasses +"""Subtile-based mainloop scheduler. + +The scheduler builds an instruction schedule for the preloop, mainloop, NGLL & NLL. The GEMM is split +into Partitions, each of which is further split into subIterK steps. + +Naming conventions: + - Partition: A rectangle of subtiles (partitionSizeA x partitionSizeB) + processed together in one mainloop step. This includes all K dimensions for those subtiles. + - subtileK: K index of the partition's subtile in the Macrotile. + - subIterK: K-index of the MFMA tiles within each subtile (range [0,1] with current subtile shape) + - MT iteration (macrotile iteration): Which macrotile's data is being + referenced. "n" = current iteration, "n+1" = next iteration, "n+2" = two ahead. + +Scheduling pipeline: + 1. _buildPreloop — Emit initial GR + LR to init the pipeline (preloop). + 2. _buildSubIterK — For each partition and subIterK, build MFMA and LR + modules with VGPR tile assignments. + 3. _insertGROps — Split GR loads across subIterK=0/1 within each partition. + 4.
_annotateDependencies — Wire up before/after dependency edges between modules + (WAIT_GR, SYNC, LR_INC, GR_INC, WAIT_LR). + 5. _buildNGLL — Derive the No-Global-Load-Loop from mainloop + (remove GR(n+2) and GR_INC). + 6. _buildNLL — Derive the No-Load-Loop from mainloop + (remove all GR, LR(n+1), and associated sync ops). + +Emission pipeline (called by the kernel writer): + 7. _buildEmittedModules — Convert AnnotatedModules into EmittedModules with + actual GPU instructions and before-link chains. + 8. instructionSchedule — Interleave non-MFMA instructions between MFMAs + using a slot-based placer with pluggable rules. +""" + from enum import Enum, auto from dataclasses import dataclass, field import math @@ -11,7 +42,7 @@ from Tensile.Components.SubtileBasedKernel import globalReadDoScaleSubtile, globalReadScalePtrUpdates from rocisa.code import Module, Label from rocisa.instruction import SWaitCnt, SBarrier, SCmpEQU32, SCmpLeU32, SCBranchSCC1, MFMAInstruction, \ - MXMFMAInstruction, GlobalReadInstruction, LocalReadInstruction, DSLoadB32 + MXMFMAInstruction, LocalReadInstruction, GlobalReadInstruction, DSLoadB32, CommonInstruction from rocisa.container import sgpr, vgpr, DSModifiers @@ -22,12 +53,10 @@ class PrefetchMode(Enum): class VGPRTileReUseStrategy(Enum): - NONE = auto() - ACROSS_SUBGROUP = auto() - WITHIN_SUBGROUP = auto() + ACROSS_PARTITIONS = auto() -class SubgroupOrdering(Enum): +class PartitionOrdering(Enum): COLUMN_MAJOR = auto() SNAKE_COLUMN_MAJOR = auto() @@ -37,8 +66,8 @@ class SchedulerConfig: partitionSizeA: int partitionSizeB: int prefetchMode: PrefetchMode - reuseStrategy: VGPRTileReUseStrategy - ordering: SubgroupOrdering = SubgroupOrdering.COLUMN_MAJOR + reuseStrategy: VGPRTileReUseStrategy = VGPRTileReUseStrategy.ACROSS_PARTITIONS + ordering: PartitionOrdering = PartitionOrdering.COLUMN_MAJOR @dataclass @@ -142,20 +171,24 @@ def totalVGPRTiles(self) -> int: @dataclass class MFMAOp: mtIteration: str # e.g. 
"n" - subIterK: int + subtileK: int # subtiles K index + subIterK: int # MFMA k index within subtile subtiles: List[Tuple[int, int]] + # Mapping to vgprTiles for values & scales vgprTileMapA: Dict[int, int] vgprTileMapB: Dict[int, int] - scaleMapA: Dict[int, int] = field(default_factory=dict) # scaleGroupIdx → scaleVgprTileId + scaleMapA: Dict[int, int] = field(default_factory=dict) scaleMapB: Dict[int, int] = field(default_factory=dict) @dataclass class GROp: mtIteration: str # e.g. "n+1", "n+2", "0", "1" - subtileA: List[int] - subtileB: List[int] + subtileA: List[int] # M-dim subtile indices + subtileB: List[int] # N-dim subtile indices + subtileK: int = 0 # subtiles K index lastForMT: bool = False # True = last partition's GR for this MT → emit ptrUpdate+swap + firstForMT: bool = False # True = first partition's GR for this MT → emit scale loads @dataclass @@ -165,6 +198,8 @@ class WaitGROp: subtileB: List[int] inflightLoadsA: Optional[int] = None inflightLoadsB: Optional[int] = None + inflightScaleLoadsA: int = 0 + inflightScaleLoadsB: int = 0 @dataclass @@ -179,7 +214,8 @@ class SyncOp: @dataclass class LROp: - mtIteration: str # e.g. "n", "n+1", "0" + mtIteration: str + subtileK: int subIterK: int lrLoadA: Dict[int, int] lrLoadB: Dict[int, int] @@ -209,6 +245,35 @@ class LR_INCOp: ScheduleOp = Union[MFMAOp, GROp, WaitGROp, WaitLROp, SyncOp, LROp, SkipOp, GR_INCOp, LR_INCOp] +@dataclass +class DepEdge: + """A synchronization/housekeeping op or module reference that acts as a dependency edge. + + Either op or module is set, not both: + - op: a sync/housekeeping instruction to emit (WAIT_GR, WAIT_LR, SYNC, etc.) 
+ - module: a reference to another AnnotatedModule that must complete first (ordering constraint) + """ + op: Union[WaitGROp, WaitLROp, SyncOp, GR_INCOp, LR_INCOp, None] = None + module: Optional['AnnotatedModule'] = None + + +@dataclass +class AnnotatedModule: + """A module with dependency edges.""" + op: ScheduleOp + before: List[DepEdge] = field(default_factory=list) + after: List[DepEdge] = field(default_factory=list) + + +@dataclass +class EmittedModule: + """One emitted module with instructions and module-link deps.""" + moduleId: int = -1 + instructions: list = field(default_factory=list) + before: Optional[int] = None # moduleId that must run before this module + opType: str = "" + + @dataclass class PartitionGR: """Describes the GR (Global Read) issued by a partition during the mainloop.""" @@ -220,10 +285,10 @@ class PartitionGR: @dataclass class SubIterKSchedule: - """Ops for one subIterK iteration within a partition.""" - subIterK: int - ops: List[ScheduleOp] = field(default_factory=list) - conflict: Set[int] = field(default_factory=set) + """Ops for one (subtileK, subIterK) step within a partition.""" + subtileK: int # subtile K index (0..numSubtileK-1) + subIterK: int # localK within subtile (0..subtileShapeK-1) + modules: List[AnnotatedModule] = field(default_factory=list) @dataclass @@ -233,6 +298,268 @@ class PartitionSchedule: subIterKSteps: List[SubIterKSchedule] = field(default_factory=list) +class _SlotPlacer: + """Generic slot placement engine for interleaving instructions between MFMAs. + + Each interval (pair of adjacent MFMAs) has 2 placement slots. 
+ Rules are injected via callbacks: + - validators: (placer, pos, inst) -> bool — reject invalid slots + - adjusters: (placer, limit, inst) -> limit — shift search start + - onPlace: (placer, pos, inst) -> None — update rule state after placement + """ + + def __init__(self, intervals: int, numModules: int, + pathOrders: List[List[int]], + validators=None, adjusters=None, onPlace=None): + self.totalSlots = intervals * 2 + self._n = numModules + self._prevInPath: List[int] = [-1] * numModules + self._nextInPath: List[int] = [-1] * numModules + for order in pathOrders: + for a, b in zip(order, order[1:]): + self._prevInPath[b] = a + self._nextInPath[a] = b + self._validators = validators or [] + self._adjusters = adjusters or [] + self._onPlace = onPlace + + self._placed: List[List[Tuple[int, object]]] = [[] for _ in range(self.totalSlots)] + self._firstPos: List[Optional[int]] = [None] * numModules + self._lastPos: List[Optional[int]] = [None] * numModules + self.leftovers: List[Tuple[int, object]] = [] + + # ── Placement ── + + def _canPlace(self, pos: int, inst) -> bool: + if pos < 0 or pos >= self.totalSlots or len(self._placed[pos]) >= 2: + return False + return all(v(self, pos, inst) for v in self._validators) + + def adjustLimit(self, limit: int, inst) -> int: + for adj in self._adjusters: + limit = adj(self, limit, inst) + return limit + + def bounds(self, mid: int) -> Tuple[int, int]: + lo = 0 + pred = self._prevInPath[mid] + if 0 <= pred < self._n and self._lastPos[pred] is not None: + lo = self._lastPos[pred] + 1 + hi = self.totalSlots - 1 + succ = self._nextInPath[mid] + if 0 <= succ < self._n and self._firstPos[succ] is not None: + hi = self._firstPos[succ] - 1 + return lo, hi + + def findSlot(self, mid: int, inst, limit: int, reverse: bool = False) -> Optional[int]: + lo, hi = self.bounds(mid) + if reverse: + hi = min(hi, limit) + else: + lo = max(lo, limit) + if hi < lo: + return None + for pos in (range(hi, lo - 1, -1) if reverse else range(lo, hi + 
1)): + if self._canPlace(pos, inst): + return pos + return None + + def _forceSlot(self, mid: int, limit: int, reverse: bool) -> int: + """Find the closest valid slot respecting dependencies, allowing >2 items per slot.""" + lo, hi = self.bounds(mid) + if reverse: + hi = min(hi, limit) + lo = max(lo, 0) + if hi < lo: + hi = lo + return hi + else: + lo = max(lo, limit) + hi = min(hi, self.totalSlots - 1) + if lo > hi: + lo = hi + return lo + + def place(self, pos: int, item: Tuple[int, object], reverse: bool = False): + mid = item[0] + if reverse: + self._placed[pos].insert(0, item) + else: + self._placed[pos].append(item) + if self._firstPos[mid] is None or pos < self._firstPos[mid]: + self._firstPos[mid] = pos + if self._lastPos[mid] is None or pos > self._lastPos[mid]: + self._lastPos[mid] = pos + if self._onPlace: + self._onPlace(self, pos, item[1]) + + def placePath(self, pathInsts: List[Tuple[int, object]], reverse: bool = False): + """Place a sequence of (moduleId, instruction) items into slots. + + Walks pathInsts in order, applying adjusters (forward only) and + finding valid slots. When no empty slot is found, force-places at + the closest valid position respecting dependencies (allowing >2 + items per slot). 
+ """ + limit = (self.totalSlots - 1) if reverse else 0 + for idx, item in enumerate(pathInsts): + mid, inst = item + if not reverse: + limit = self.adjustLimit(limit, inst) + pos = self.findSlot(mid, inst, limit, reverse=reverse) + if pos is None: + pos = self._forceSlot(mid, limit, reverse) + self.place(pos, item, reverse=reverse) + limit = (pos - 1) if reverse else (pos + 1) + + # ── Assembly ── + + def assemble(self, mfmas) -> Module: + intervals = len(mfmas) - 1 + result = Module() + result.add(mfmas[0]) + for i in range(intervals): + for slot in (2 * i, 2 * i + 1): + for item in self._placed[slot]: + result.add(item[1]) + result.add(mfmas[i + 1]) + for _, inst in self.leftovers: + result.add(inst) + return result + + +# ── Scheduling rules ── + +# Hardcoded gap to hide ds_read latency. TODO: compute this more accurately. +_MIN_MFMA_GAP_DS_READ_TO_WAIT = 4 + +_isDsRead = lambda x: isinstance(x, LocalReadInstruction) +_isBufferLoad = lambda x: isinstance(x, GlobalReadInstruction) +_isWaitCnt = lambda x: isinstance(x, SWaitCnt) +_isM0Update = lambda x: isinstance(x, CommonInstruction) and hasattr(x, 'dst') and hasattr(x.dst, 'regType') and x.dst.regType == 'm' + + +class _SchedulingRules: + """Scheduling rules for slot placement: validators, adjusters, and placement hooks. + + Owns all rule state (ds_read/waitcnt tracking, buffer-load spreading). + Bound methods are passed as callbacks to _SlotPlacer. 
+ """ + + def __init__(self, totalSlots: int): + # Cross-path state + self.lastDsReadPos = -1 + self.earliestWaitCntPos = totalSlots + # Per-path state + self._resetPath() + + def _resetPath(self): + self.firstBufLoadPos: Optional[int] = None + self.bufLoadIdx = 0 + self.bufLoadMaxSlot = 0 + self.numBufLoads = 0 + + # ── Validators: (placer, pos, inst) -> bool ── + + def oneDsReadPerInterval(self, placer, pos, inst): + """At most one ds_read per interval (pair of slots) to avoid same SIMD pair stalls as we have a single codepath""" + if not _isDsRead(inst): + return True + peer = pos ^ 1 + return not (0 <= peer < placer.totalSlots + and any(_isDsRead(item[1]) for item in placer._placed[peer])) + + def minGapDsReadBeforeWait(self, placer, pos, inst): + """Reject ds_read too close to an already-placed waitcnt ahead.""" + if not _isDsRead(inst): + return True + gap = _MIN_MFMA_GAP_DS_READ_TO_WAIT * 2 + return self.earliestWaitCntPos - pos >= gap + + def minGapDsReadToWait(self, placer, pos, inst): + """Reject waitcnt too close to the last placed ds_read.""" + if not _isWaitCnt(inst) or self.lastDsReadPos < 0: + return True + gap = _MIN_MFMA_GAP_DS_READ_TO_WAIT * 2 + return pos - self.lastDsReadPos >= gap + + def noM0WithBufferLoad(self, placer, pos, inst): + """Avoid placing M0 updates and buffer_loads in the same MFMA interval.""" + if not _isM0Update(inst) and not _isBufferLoad(inst): + return True + peer = pos ^ 1 + slots = [pos] + if 0 <= peer < placer.totalSlots: + slots.append(peer) + if _isM0Update(inst): + return not any(_isBufferLoad(item[1]) for s in slots for item in placer._placed[s]) + return not any(_isM0Update(item[1]) for s in slots for item in placer._placed[s]) + + # ── Adjusters: (placer, limit, inst) -> limit ── + + def spreadBufferLoads(self, placer, limit, inst): + """Spread buffer_load instructions evenly across available range.""" + if not _isBufferLoad(inst) or self.bufLoadMaxSlot <= 0: + return limit + if self.firstBufLoadPos is not None: + 
stride = max(1, (self.bufLoadMaxSlot - self.firstBufLoadPos) + // self.numBufLoads) + limit = max(limit, self.firstBufLoadPos + + self.bufLoadIdx * stride) + self.bufLoadIdx += 1 + return limit + + # ── Placement hook: (placer, pos, inst) -> None ── + + def trackPlacement(self, placer, pos, inst): + """Update rule state after a successful placement.""" + if _isDsRead(inst): + self.lastDsReadPos = max(self.lastDsReadPos, pos) + if _isWaitCnt(inst): + self.earliestWaitCntPos = min(self.earliestWaitCntPos, pos) + if _isBufferLoad(inst) and self.firstBufLoadPos is None: + self.firstBufLoadPos = pos + + # ── Per-path setup ── + + def resetPath(self): + self._resetPath() + + def setupBufLoadSpreading(self, placer, pathInsts, order): + """Compute buffer-load spreading bounds for a forward path. + + Reserves tail slots for non-buffer-load instructions in modules that + follow the last GR module (e.g. GR_INC SRD updates, LDS buffer swaps). + """ + self.numBufLoads = sum(1 for _, inst in pathInsts if _isBufferLoad(inst)) + if self.numBufLoads > 1: + _, rawMax = placer.bounds(pathInsts[-1][0]) + grModuleIds = {mid for mid, inst in pathInsts if _isBufferLoad(inst)} + lastGrIdx = max(order.index(m) for m in grModuleIds if m in order) + tailModuleIds = set(order[lastGrIdx + 1:]) + numTailInsts = sum(1 for mid, _ in pathInsts if mid in tailModuleIds) + # this is an approximation as we don't know exactly how many slots will be used by modules after the GR yet (in this codepath) + self.bufLoadMaxSlot = max(0, rawMax - numTailInsts) + + +def _classifyPaths(pathOrders, emittedModules): + """Classify paths by wait_gr presence, sorted: wait_gr first, then by index.""" + paths = [] + for order in pathOrders: + hasWaitGR = any(emittedModules[i].opType == "wait_gr" for i in order) + paths.append((order, hasWaitGR)) + paths.sort(key=lambda p: (0 if p[1] else 1, p[0][0] if p[0] else 10**9)) + return paths + + +def _flattenPath(order, emittedModules, reverse=False): + """Flatten a path of
module indices into (moduleId, instruction) pairs.""" + pathInsts = [(mid, inst) for mid in order for inst in emittedModules[mid].instructions] + if reverse: + pathInsts.reverse() + return pathInsts + + class SubtileBasedScheduler: def __init__(self, tileInfoA, tileInfoB, config: SchedulerConfig, scaleTileInfoA=None, scaleTileInfoB=None): @@ -241,6 +568,9 @@ def __init__(self, tileInfoA, tileInfoB, config: SchedulerConfig, self.scaleTileInfoA = scaleTileInfoA self.scaleTileInfoB = scaleTileInfoB self.hasScale = scaleTileInfoA is not None and scaleTileInfoB is not None + # Number of scale loads per MT (one load covers the entire MT) + self.scaleLoadsPerMT_A = 1 if self.hasScale else 0 + self.scaleLoadsPerMT_B = 1 if self.hasScale else 0 self.config = config self.MTA = tileInfoA.localSubtileGrid[0] @@ -254,20 +584,18 @@ def __init__(self, tileInfoA, tileInfoB, config: SchedulerConfig, self.numPartitionsA = self.MTA // config.partitionSizeA self.numPartitionsB = self.MTB // config.partitionSizeB - self.numSubIterK = tileInfoA.subtileShape[1] - assert self.numSubIterK == tileInfoB.subtileShape[1], \ + self.subtileShapeK = tileInfoA.subtileShape[1] + assert self.subtileShapeK == tileInfoB.subtileShape[1], \ "A and B must have same subtileShape[1]" - assert tileInfoA.localSubtileGrid[1] == 1, \ - f"Scheduler requires localSubtileGrid[1]==1 for A, got {tileInfoA.localSubtileGrid[1]}" - assert tileInfoB.localSubtileGrid[1] == 1, \ - f"Scheduler requires localSubtileGrid[1]==1 for B, got {tileInfoB.localSubtileGrid[1]}" + self.numSubtileK = tileInfoA.localSubtileGrid[1] + self.numSubIterK = self.subtileShapeK # intra-subtile K steps only self.partitions: List[Partition] = self._buildPartitions() self.allocator = VGPRTileAllocator() - self.hasDuplicatedReads: bool = False - self.needsUnrolling: bool = False # Scale VGPR tile IDs are deterministic: gid for A, numScaleGroupsA + gid for B. + # Each scale VGPR covers 2 M-adjacent subtiles × 2 localK (4 E8M0 bytes). 
+ # Subtile K indices reuse the same VGPRs (consumed sequentially). self.numScaleGroupsA = math.ceil(self.MTA / 2) if self.hasScale else 0 self.numScaleGroupsB = math.ceil(self.MTB / 2) if self.hasScale else 0 self.totalScaleVGPRTiles = self.numScaleGroupsA + self.numScaleGroupsB @@ -282,7 +610,8 @@ def totalVGPRTiles(self) -> int: def scaleVid(self, tc: str, subtileIdx: int) -> Tuple[int, int]: """Deterministic scale VGPR tile ID for the group containing subtileIdx. - Returns (scaleGroupIdx, vgprTileId).""" + Returns (scaleGroupIdx, vgprTileId). + Subtile K indices reuse the same VGPRs — they are reloaded per subtileK.""" gid = subtileIdx // 2 vid = gid if tc == 'A' else self.numScaleGroupsA + gid return gid, vid @@ -291,11 +620,11 @@ def scaleVid(self, tc: str, subtileIdx: int) -> Tuple[int, int]: def _generateOrder(self) -> List[Tuple[int, int]]: order = [] - if self.config.ordering == SubgroupOrdering.COLUMN_MAJOR: + if self.config.ordering == PartitionOrdering.COLUMN_MAJOR: for col in range(self.numPartitionsB): for row in range(self.numPartitionsA): order.append((row, col)) - elif self.config.ordering == SubgroupOrdering.SNAKE_COLUMN_MAJOR: + elif self.config.ordering == PartitionOrdering.SNAKE_COLUMN_MAJOR: for col in range(self.numPartitionsB): if col % 2 == 0: for row in range(self.numPartitionsA): @@ -322,6 +651,8 @@ def _buildPartitions(self) -> List[Partition]: def _computePartitionGRs(self, preloadedMTn1_A: Set[int], preloadedMTn1_B: Set[int]) -> Dict[int, PartitionGR]: """Compute each partition's GR target (mtIteration, targetPartition, subtiles). + Current behavior: load MT n+1, partition + 1. + TODO: Change this to allow better GR spreading across partitions when using a multi-partition config Args: preloadedMTn1_A/B: MT n+1 subtiles already loaded by the preloop's GR(MT 1).
@@ -367,40 +698,44 @@ def _buildPreloop(self) -> Tuple[Set[int], Set[int]]: preloadMT1_A = list(first.tileAIndices) preloadMT1_B = list(first.tileBIndices) - # Number of subIterK to preload LR for + # Preload range: HALF preloads (subtileK=0, localK=0) only; FULL preloads all if self.config.prefetchMode == PrefetchMode.HALF_PREFETCH: - numPreloadSubIterKs = 1 + numPreloadSId1s = 1 + numPreloadLocalKs = 1 elif self.config.prefetchMode == PrefetchMode.FULL_PREFETCH: - numPreloadSubIterKs = self.numSubIterK + numPreloadSId1s = self.numSubtileK + numPreloadLocalKs = self.numSubIterK # Allocate VGPRs for first group and build LR maps lrOps = [] - for sik in range(numPreloadSubIterKs): - lrLoadA = {} - lrLoadB = {} - for tA in first.tileAIndices: - lrLoadA[tA] = self.allocator.allocate('A', tA, sik) - for tB in first.tileBIndices: - lrLoadB[tB] = self.allocator.allocate('B', tB, sik) - - # Allocate scale VGPRs at subIterK==0 (scale is constant across subIterK) - lrScaleA = {} - lrScaleB = {} - if self.hasScale and sik == 0: + for subtileK in range(numPreloadSId1s): + for localK in range(numPreloadLocalKs): + flatK = subtileK * self.subtileShapeK + localK + lrLoadA = {} + lrLoadB = {} for tA in first.tileAIndices: - gid, vid = self.scaleVid('A', tA) - lrScaleA.setdefault(gid, vid) + lrLoadA[tA] = self.allocator.allocate('A', tA, flatK) for tB in first.tileBIndices: - gid, vid = self.scaleVid('B', tB) - lrScaleB.setdefault(gid, vid) + lrLoadB[tB] = self.allocator.allocate('B', tB, flatK) + + # Allocate scale VGPRs at the start of each subtile K index + lrScaleA = {} + lrScaleB = {} + if self.hasScale and localK == 0: + for tA in first.tileAIndices: + gid, vid = self.scaleVid('A', tA) + lrScaleA.setdefault(gid, vid) + for tB in first.tileBIndices: + gid, vid = self.scaleVid('B', tB) + lrScaleB.setdefault(gid, vid) - lrOps.append(LROp(mtIteration="0", subIterK=sik, - lrLoadA=lrLoadA, lrLoadB=lrLoadB, - lrScaleA=lrScaleA, lrScaleB=lrScaleB)) + 
lrOps.append(LROp(mtIteration="0", subtileK=subtileK, subIterK=localK, + lrLoadA=lrLoadA, lrLoadB=lrLoadB, + lrScaleA=lrScaleA, lrScaleB=lrScaleB)) - # Build preloop steps: GR(MT0) split by partition, WAIT, LR(MT0), SKIP guards, GR(MT1) + # Build preloop steps: GR(MT0) split by partition × subtileK, WAIT, LR(MT0), SKIP guards, GR(MT1) preloopOps: List[ScheduleOp] = [] - # Split MT 0 GR by partition with dedup (same order as mainloop) + # Split MT 0 GR by partition with dedup, one GR per (partition, subtileK) loadedA: Set[int] = set() loadedB: Set[int] = set() for partition in self.partitions: @@ -409,13 +744,19 @@ def _buildPreloop(self) -> Tuple[Set[int], Set[int]]: loadedA.update(partition.tileAIndices) loadedB.update(partition.tileBIndices) if grA or grB: - preloopOps.append(GROp(mtIteration="0", - subtileA=grA, subtileB=grB, - lastForMT=False)) + for subtileK in range(self.numSubtileK): + preloopOps.append(GROp(mtIteration="0", + subtileA=grA, subtileB=grB, subtileK=subtileK, + lastForMT=False)) + # Mark the first MT 0 GR as firstForMT + for i in range(len(preloopOps)): + if isinstance(preloopOps[i], GROp): + preloopOps[i].firstForMT = True + break # Mark the last MT 0 GR as lastForMT for i in range(len(preloopOps) - 1, -1, -1): if isinstance(preloopOps[i], GROp): - preloopOps[i] = dataclasses.replace(preloopOps[i], lastForMT=True) + preloopOps[i].lastForMT = True break preloopOps.append(GR_INCOp()) preloopOps.append(WaitGROp(mtIteration="0", @@ -429,19 +770,162 @@ def _buildPreloop(self) -> Tuple[Set[int], Set[int]]: nllTarget = "NLLEarly" if self.hasScale else "NLL" preloopOps.append(SkipOp(compare="LE", value=1, target=nllTarget)) mt1Complete = (set(preloadMT1_A) == set(allA) and set(preloadMT1_B) == set(allB)) - preloopOps.append(GROp(mtIteration="1", - subtileA=preloadMT1_A, subtileB=preloadMT1_B, - lastForMT=mt1Complete)) + for subtileK in range(self.numSubtileK): + isFirstSId1 = (subtileK == 0) + isLastSId1 = (subtileK == self.numSubtileK - 1) + 
preloopOps.append(GROp(mtIteration="1", + subtileA=preloadMT1_A, subtileB=preloadMT1_B, subtileK=subtileK, + firstForMT=isFirstSId1, lastForMT=mt1Complete and isLastSId1)) if mt1Complete: preloopOps.append(GR_INCOp()) preloopOps.append(SkipOp(compare="LE", value=2, target="NGLL")) - preloopSik = SubIterKSchedule(subIterK=0) - preloopSik.ops = preloopOps + preloopSik = SubIterKSchedule(subtileK=0, subIterK=0) + preloopSik.modules = [AnnotatedModule(op=op) for op in preloopOps] self.preloopSteps: List[PartitionSchedule] = [ PartitionSchedule(partitionId=0, subIterKSteps=[preloopSik])] return set(preloadMT1_A), set(preloadMT1_B) + def _buildSubIterK(self, partition, pi, subtileK, localK, numPartitions): + """Build MFMA + LR modules for one (subtileK, localK) step within a partition.""" + flatK = subtileK * self.subtileShapeK + localK + # MFMA: map subtile indices to VGPR tile IDs + vgprTileMapA = {tA: self.allocator.getVGPRTileId('A', tA, flatK) + for tA in partition.tileAIndices} + vgprTileMapB = {tB: self.allocator.getVGPRTileId('B', tB, flatK) + for tB in partition.tileBIndices} + + # MFMA scale maps — same VGPRs reloaded per subtile K index + scaleMapA, scaleMapB = {}, {} + if self.hasScale: + for tA in partition.tileAIndices: + gid, vid = self.scaleVid('A', tA) + scaleMapA.setdefault(gid, vid) + for tB in partition.tileBIndices: + gid, vid = self.scaleVid('B', tB) + scaleMapB.setdefault(gid, vid) + + # LR: load targets determined by prefetch mode + loadATiles, loadBTiles, targetSId1, targetLocalK = self._getLoadTargets(pi, subtileK, localK, numPartitions) + isWrapAround = self._isWrapAroundLoad(pi, subtileK, localK, numPartitions) + loadFlatK = targetSId1 * self.subtileShapeK + targetLocalK + + lrLoadA = {tA: v for tA in (loadATiles or []) + if (v := self._loadTile('A', tA, loadFlatK, isWrapAround)) is not None} + lrLoadB = {tB: v for tB in (loadBTiles or []) + if (v := self._loadTile('B', tB, loadFlatK, isWrapAround)) is not None} + + # Scale VGPRs for loaded 
tiles (at the start of each subtile K index) + lrScaleA, lrScaleB = {}, {} + if self.hasScale and targetLocalK == 0: + for tA in (loadATiles or []): + gid, vid = self.scaleVid('A', tA) + lrScaleA.setdefault(gid, vid) + for tB in (loadBTiles or []): + gid, vid = self.scaleVid('B', tB) + lrScaleB.setdefault(gid, vid) + + # Conflict detection: MFMA reads and LR writes must not share VGPR tiles + mfmaIds = set(vgprTileMapA.values()) | set(vgprTileMapB.values()) + loadIds = set(lrLoadA.values()) | set(lrLoadB.values()) + overlap = mfmaIds & loadIds + if overlap: + raise RuntimeError( + f"VGPR tile conflict in partition {partition.partitionId} subtileK={subtileK} localK={localK}: " + f"MFMA and LR share tile IDs {overlap}") + + # Build modules + mfmas = [(a, b) for a in sorted(vgprTileMapA) for b in sorted(vgprTileMapB)] + mtLoad = "n+1" if isWrapAround else "n" + siks = SubIterKSchedule(subtileK=subtileK, subIterK=localK) + siks.modules.append(AnnotatedModule(op=MFMAOp( + mtIteration="n", subtileK=subtileK, subIterK=localK, subtiles=mfmas, + vgprTileMapA=vgprTileMapA, vgprTileMapB=vgprTileMapB, + scaleMapA=scaleMapA, scaleMapB=scaleMapB))) + siks.modules.append(AnnotatedModule(op=LROp( + mtIteration=mtLoad, subtileK=targetSId1, subIterK=targetLocalK, + lrLoadA=lrLoadA, lrLoadB=lrLoadB, + lrScaleA=lrScaleA, lrScaleB=lrScaleB))) + + return siks + + def _isLastGRForMT(self, pi, gr, numPartitions): + """Check if this partition's GR is the last one that completes a full MT load.""" + if gr.mtIteration == "n+1": + return not any( + (self.partitionGRs[fpi].subtileA or self.partitionGRs[fpi].subtileB) + and self.partitionGRs[fpi].mtIteration == "n+1" + for fpi in range(pi + 1, numPartitions)) + if gr.mtIteration == "n+2": + hasN1 = any( + (self.partitionGRs[p].subtileA or self.partitionGRs[p].subtileB) + and self.partitionGRs[p].mtIteration == "n+1" + for p in range(numPartitions)) + if hasN1: + return False + return not any( + (self.partitionGRs[fpi].subtileA or 
self.partitionGRs[fpi].subtileB) + and self.partitionGRs[fpi].mtIteration == "n+2" + for fpi in range(pi + 1, numPartitions)) + return False + + def _isFirstGRForMT(self, pi, gr, numPartitions): + """Check if this partition's GR is the first one for its MT iteration.""" + if gr.mtIteration in ("n+1", "n+2"): + return not any( + (self.partitionGRs[fpi].subtileA or self.partitionGRs[fpi].subtileB) + and self.partitionGRs[fpi].mtIteration == gr.mtIteration + for fpi in range(0, pi)) + return False + + def _insertGROps(self, pss, pi, gr, numPartitions): + """Insert GR ops for a partition, one per (M-subtile-chunk, subtileK), spread across steps.""" + if not gr.subtileA and not gr.subtileB: + return + + totalGR_A = sorted(gr.subtileA) + totalGR_B = sorted(gr.subtileB) + isLast = self._isLastGRForMT(pi, gr, numPartitions) + isFirst = self._isFirstGRForMT(pi, gr, numPartitions) + + # Split M-dim subtiles into min(numSubIterK, 2) chunks, then replicate per subtileK. + numMSplits = min(self.numSubIterK, 2) + def _splitEvenly(items, n): + k, r = divmod(len(items), n) + chunks, start = [], 0 + for i in range(n): + end = start + k + (1 if i < r else 0) + chunks.append(items[start:end]) + start = end + return chunks + + mChunksA = _splitEvenly(totalGR_A, numMSplits) + mChunksB = _splitEvenly(totalGR_B, numMSplits) + + # Build flat list of (stepIdx, GROp) — one per (mChunk, subtileK) + grOps = [] + totalSteps = len(pss.subIterKSteps) + stepIdx = 0 + for subtileK in range(self.numSubtileK): + for mIdx in range(numMSplits): + cA, cB = mChunksA[mIdx], mChunksB[mIdx] + if cA or cB: + grOps.append((min(stepIdx, totalSteps - 1), subtileK, cA, cB)) + stepIdx += 1 + + # Assign firstForMT / lastForMT + for i, (si, subtileK, cA, cB) in enumerate(grOps): + pss.subIterKSteps[si].modules.append(AnnotatedModule(op=GROp( + mtIteration=gr.mtIteration, + subtileA=cA, subtileB=cB, subtileK=subtileK, + firstForMT=isFirst and i == 0, + lastForMT=isLast and i == len(grOps) - 1))) + + # Generate the 
schedule + # 1- build subIterK steps + # 2- insert GR ops + # 3- annotate dependencies (WAIT and INC Ops) + # 4- build NGLL & NLL using mainloop schedule def _runSchedule(self): if self.config.prefetchMode == PrefetchMode.NO: raise NotImplementedError("PrefetchMode.NO is not yet supported") @@ -454,177 +938,25 @@ def _runSchedule(self): for pi, partition in enumerate(self.partitions): pss = PartitionSchedule(partitionId=partition.partitionId) - gr = self.partitionGRs[pi] - subIterK0LoadAKeys: Set[int] = set() - subIterK0LoadBKeys: Set[int] = set() - - for sik in range(self.numSubIterK): - # USE: current group's tiles at current subIterK - # MFMA: map subtile indices to VGPR tile IDs - vgprTileMapA = {} - vgprTileMapB = {} - for tA in partition.tileAIndices: - vgprTileMapA[tA] = self.allocator.getVGPRTileId('A', tA, sik) - for tB in partition.tileBIndices: - vgprTileMapB[tB] = self.allocator.getVGPRTileId('B', tB, sik) - - # MFMA scale maps: look up already-allocated scale VGPR tile IDs - scaleMapA = {} - scaleMapB = {} - if self.hasScale: - for tA in partition.tileAIndices: - gid, vid = self.scaleVid('A', tA) - scaleMapA.setdefault(gid, vid) - for tB in partition.tileBIndices: - gid, vid = self.scaleVid('B', tB) - scaleMapB.setdefault(gid, vid) - # LOAD: determined by prefetch mode - loadATiles, loadBTiles, loadSubIterK = self._getLoadTargets(pi, sik, numPartitions) - isWrapAround = self._isWrapAroundLoad(pi, sik, numPartitions) - curA = set(partition.tileAIndices) - curB = set(partition.tileBIndices) - if sik == 0: - self._pendingRemap = [] - - lrLoadA = {} - lrLoadB = {} - if loadATiles is not None: - for tA in loadATiles: - vid = self._loadTile('A', tA, loadSubIterK, isWrapAround, curA) - if vid is not None: - lrLoadA[tA] = vid - - if loadBTiles is not None: - for tB in loadBTiles: - vid = self._loadTile('B', tB, loadSubIterK, isWrapAround, curB) - if vid is not None: - lrLoadB[tB] = vid - - # Allocate scale VGPRs for loaded tiles (only when loading subIterK==0 
data, - # since scale data is constant across subIterK within one MT iteration). - # Use loadSubIterK (not sik) because wrap-around loads target subIterK 0 - # even though the current partition's sik may be > 0. - lrScaleA = {} - lrScaleB = {} - if self.hasScale and loadSubIterK == 0: - if loadATiles is not None: - for tA in loadATiles: - gid, vid = self.scaleVid('A', tA) - lrScaleA.setdefault(gid, vid) - if loadBTiles is not None: - for tB in loadBTiles: - gid, vid = self.scaleVid('B', tB) - lrScaleB.setdefault(gid, vid) - - # Check MFMA and LOAD VGPRTile IDs don't overlap - mfmaIds = set(vgprTileMapA.values()) | set(vgprTileMapB.values()) - loadIds = set(lrLoadA.values()) | set(lrLoadB.values()) - overlap = mfmaIds & loadIds - conflict = set() - if overlap: - conflict = overlap - self.needsUnrolling = True - - # Build SubIterKSchedule with MFMA and LR ops - siks = SubIterKSchedule(subIterK=sik) - mfmas = [(a, b) for a in sorted(vgprTileMapA.keys()) for b in sorted(vgprTileMapB.keys())] - mtLoad = "n+1" if isWrapAround else "n" - siks.ops.append(MFMAOp(mtIteration="n", subIterK=sik, - subtiles=mfmas, - vgprTileMapA=vgprTileMapA, vgprTileMapB=vgprTileMapB, - scaleMapA=scaleMapA, scaleMapB=scaleMapB)) - siks.ops.append(LROp(mtIteration=mtLoad, subIterK=loadSubIterK, - lrLoadA=lrLoadA, lrLoadB=lrLoadB, - lrScaleA=lrScaleA, lrScaleB=lrScaleB)) - siks.conflict = conflict - pss.subIterKSteps.append(siks) - - # save subtiles for subIterK=0 to check where to insert GR(n+2) - if sik == 0: - subIterK0LoadAKeys = set(lrLoadA.keys()) - subIterK0LoadBKeys = set(lrLoadB.keys()) - - # WITHIN_SUBGROUP: release current subIterK's MFMA tiles for K-dim reuse - if self.config.reuseStrategy == VGPRTileReUseStrategy.WITHIN_SUBGROUP: - for tA in vgprTileMapA: - if self.allocator.isAllocated('A', tA, sik): - self.allocator.release('A', tA, sik) - for tB in vgprTileMapB: - if self.allocator.isAllocated('B', tB, sik): - self.allocator.release('B', tB, sik) - - # Insert GROps split across 
subIterK=0 and subIterK=1 - if gr.subtileA or gr.subtileB: - totalGR_A = sorted(gr.subtileA) - totalGR_B = sorted(gr.subtileB) - splitA = (len(totalGR_A) + 1) // 2 - splitB = (len(totalGR_B) + 1) // 2 - gr0_A, gr1_A = totalGR_A[:splitA], totalGR_A[splitA:] - gr0_B, gr1_B = totalGR_B[:splitB], totalGR_B[splitB:] - - # lastForMT: true for the last GR that completes a full MT load - # within this loop iteration. One GR_INC per loop iteration. - # - For n+1 GRs: true when no more n+1 GRs follow. - # - For n+2 GRs: true only when there are no n+1 GRs at all - # (1 partition case where n+2 loads all subtiles in one shot). - # Otherwise n+2 is partial and continues in the next iteration. - isLastForThisMT = False - if gr.mtIteration == "n+1": - isLastForThisMT = True - for fpi in range(pi + 1, numPartitions): - fgr = self.partitionGRs[fpi] - if (fgr.subtileA or fgr.subtileB) and fgr.mtIteration == "n+1": - isLastForThisMT = False - break - elif gr.mtIteration == "n+2": - # n+2 gets GR_INC only if no n+1 GRs exist (single partition) - hasN1 = any((self.partitionGRs[p].subtileA or self.partitionGRs[p].subtileB) - and self.partitionGRs[p].mtIteration == "n+1" - for p in range(numPartitions)) - if not hasN1: - # Check this is the last n+2 GR - isLastForThisMT = True - for fpi in range(pi + 1, numPartitions): - fgr = self.partitionGRs[fpi] - if (fgr.subtileA or fgr.subtileB) and fgr.mtIteration == "n+2": - isLastForThisMT = False - break - - if gr0_A or gr0_B: - pss.subIterKSteps[0].ops.append(GROp( - mtIteration=gr.mtIteration, - subtileA=gr0_A, subtileB=gr0_B, - lastForMT=False)) - if gr1_A or gr1_B: - pss.subIterKSteps[1].ops.append(GROp( - mtIteration=gr.mtIteration, - subtileA=gr1_A, subtileB=gr1_B, - lastForMT=isLastForThisMT)) - elif isLastForThisMT: - # All GRs fit in subIterK=0, mark that one as last - pss.subIterKSteps[0].ops[-1] = dataclasses.replace( - pss.subIterKSteps[0].ops[-1], lastForMT=True) + for subtileK in range(self.numSubtileK): + for localK in 
range(self.numSubIterK): + pss.subIterKSteps.append( + self._buildSubIterK(partition, pi, subtileK, localK, numPartitions)) + # split GR ops across subIterK steps and determine lastForMT + self._insertGROps(pss, pi, self.partitionGRs[pi], numPartitions) self.mainloopSteps.append(pss) - # Release after partition based on strategy - if self.config.reuseStrategy == VGPRTileReUseStrategy.WITHIN_SUBGROUP: - for tc, tileIdx, subIterK, shadowKey in self._pendingRemap: - vid = self.allocator._allocMap(tc).pop((shadowKey, subIterK)) - self.allocator._allocMap(tc)[(tileIdx, subIterK)] = vid - self._pendingRemap = [] - elif self.config.reuseStrategy == VGPRTileReUseStrategy.ACROSS_SUBGROUP: + if self.config.reuseStrategy == VGPRTileReUseStrategy.ACROSS_PARTITIONS: self._releaseUnusedAfterPartition(pi) - self._insertWaitsAndSync(numPartitions) + self._annotateDependencies(numPartitions) self.ngllSteps = self._buildNGLL() self.nllSteps = self._buildNLL() - self._checkDuplicatedReads() def _loadTile(self, tc: str, tileIdx: int, loadSubIterK: int, - isWrapAround: bool, - currentPartitionTiles: Set[int]) -> Optional[int]: + isWrapAround: bool) -> Optional[int]: """Determine the VGPRTile ID for a load. Returns None if no load needed.""" allocated = self.allocator.isAllocated(tc, tileIdx, loadSubIterK) @@ -633,63 +965,55 @@ def _loadTile(self, tc: str, tileIdx: int, loadSubIterK: int, return self.allocator.getVGPRTileId(tc, tileIdx, loadSubIterK) if not allocated: - # Fresh allocation return self.allocator.allocate(tc, tileIdx, loadSubIterK) - if self.config.reuseStrategy == VGPRTileReUseStrategy.WITHIN_SUBGROUP \ - and tileIdx in currentPartitionTiles: - # Tile is allocated by the current partition but will be released after it. - # Must allocate a new VGPR for the next partition's data. - # Use a shadow key to avoid overwriting the current allocation. 
- shadowKey = -(tileIdx + 1) # negative to avoid collision - vid = self.allocator.allocate(tc, shadowKey, loadSubIterK) - # Store the real tileIdx mapping for later fixup - self._pendingRemap.append((tc, tileIdx, loadSubIterK, shadowKey)) - return vid - - # NONE / ACROSS_SUBGROUP: tile stays alive, reuse in place + # Tile stays alive, reuse in place return None - def _isWrapAroundLoad(self, partitionIdx: int, subIterK: int, numPartitions: int) -> bool: + def _isWrapAroundLoad(self, partitionIdx: int, subtileK: int, localK: int, numPartitions: int) -> bool: """True when this step's load targets partition 0 for the next macrotile iteration.""" if self.config.prefetchMode == PrefetchMode.HALF_PREFETCH: - return partitionIdx == numPartitions - 1 and subIterK == self.numSubIterK - 1 + return (partitionIdx == numPartitions - 1 + and subtileK == self.numSubtileK - 1 + and localK == self.numSubIterK - 1) elif self.config.prefetchMode == PrefetchMode.FULL_PREFETCH: return partitionIdx == numPartitions - 1 return False # ── Prefetch modes ─────────────────────────────────────── - def _getLoadTargets(self, partitionIdx: int, subIterK: int, - numPartitions: int) -> Tuple[Optional[List[int]], Optional[List[int]], int]: - """Returns (loadATiles, loadBTiles, targetSubIterK).""" + def _getLoadTargets(self, partitionIdx: int, subtileK: int, localK: int, + numPartitions: int) -> Tuple[Optional[List[int]], Optional[List[int]], int, int]: + """Returns (loadATiles, loadBTiles, targetSId1, targetLocalK).""" if self.config.prefetchMode == PrefetchMode.HALF_PREFETCH: - return self._loadTargetsHalfPrefetch(partitionIdx, subIterK, numPartitions) + return self._loadTargetsHalfPrefetch(partitionIdx, subtileK, localK, numPartitions) elif self.config.prefetchMode == PrefetchMode.FULL_PREFETCH: - return self._loadTargetsFullPrefetch(partitionIdx, subIterK, numPartitions) - return (None, None, 0) + return self._loadTargetsFullPrefetch(partitionIdx, subtileK, localK, numPartitions) + return (None, 
None, 0, 0) - def _loadTargetsHalfPrefetch(self, partitionIdx, subIterK, numPartitions): - """HALF: subIterK=0 loads same-partition subIterK=1, subIterK=last loads next-partition subIterK=0. - Last partition wraps around to partition 0 (next iteration).""" + def _loadTargetsHalfPrefetch(self, partitionIdx, subtileK, localK, numPartitions): + """HALF: advance (subtileK, localK) by one step. Wraps subtileK then partition.""" currentPartition = self.partitions[partitionIdx] - if subIterK < self.numSubIterK - 1: - targetSubIterK = subIterK + 1 - return (currentPartition.tileAIndices, currentPartition.tileBIndices, targetSubIterK) + if localK < self.numSubIterK - 1: + # Next localK, same subtileK, same partition + return (currentPartition.tileAIndices, currentPartition.tileBIndices, subtileK, localK + 1) + elif subtileK < self.numSubtileK - 1: + # First localK, next subtileK, same partition + return (currentPartition.tileAIndices, currentPartition.tileBIndices, subtileK + 1, 0) else: + # First localK, first subtileK, next partition nextPartition = self.partitions[(partitionIdx + 1) % numPartitions] - return (nextPartition.tileAIndices, nextPartition.tileBIndices, 0) + return (nextPartition.tileAIndices, nextPartition.tileBIndices, 0, 0) - def _loadTargetsFullPrefetch(self, partitionIdx, subIterK, numPartitions): - """FULL: subIterK=0 loads next-partition subIterK=0, subIterK=1 loads next-partition subIterK=1. 
- Last partition wraps around to partition 0 (next iteration).""" + def _loadTargetsFullPrefetch(self, partitionIdx, subtileK, localK, numPartitions): + """FULL: load same (subtileK, localK) from next partition.""" nextPartition = self.partitions[(partitionIdx + 1) % numPartitions] - return (nextPartition.tileAIndices, nextPartition.tileBIndices, subIterK) + return (nextPartition.tileAIndices, nextPartition.tileBIndices, subtileK, localK) # ── Reuse strategies ───────────────────────────────────── def _releaseUnusedAfterPartition(self, partitionIdx: int): - """ACROSS_SUBGROUP: release tiles not appearing in any future partition. + """ACROSS_PARTITIONS: release tiles not appearing in any future partition. Partition 0's tiles are always considered "future" because the wrap-around LR at the end of the loop loads back into partition 0's vgprTile IDs.""" currentPartition = self.partitions[partitionIdx] @@ -709,200 +1033,212 @@ def _releaseUnusedAfterPartition(self, partitionIdx: int): if tB not in futureB: self.allocator.releaseAllForTile('B', tB) - def _releasePartitionTiles(self, partition: Partition): - """WITHIN_SUBGROUP: release all tiles (all subIterK) of this partition.""" - for tA in partition.tileAIndices: - self.allocator.releaseAllForTile('A', tA) - for tB in partition.tileBIndices: - self.allocator.releaseAllForTile('B', tB) - - def _insertWaitsAndSync(self, numPartitions: int): - """Pass 2: Insert WAIT_LR, WAIT_GR, SyncOp and reorder ops. - - After pass 1, each subIterK has: [MFMAOp, LROp, GROp?] - - This pass produces the final ordering per subIterK: - subIterK=0 (LR for MT n, GR n+2 collides): - MFMAs → LR → WAIT_LR → SyncOp → GR(n+2) - subIterK=1 (LR for MT n+1, WAIT_GR needed): - MFMAs → GR(n+2) → WAIT_GR → SyncOp → LR(n+1) → WAIT_LR - """ - # ── Pass 3 prep: build GR events for inflight counting ── - # Each entry: (opIndex, mtIteration, subtileA_set, subtileB_set) - # Also record the opIndex of each WAIT_GR candidate (subIterK==0 LR ops). 
+ def _buildGREvents(self): + """Build GR events list from modules for inflight counting.""" grEvents = [] opIdx = 0 for pss in self.mainloopSteps: for dus in pss.subIterKSteps: - for op in dus.ops: - if isinstance(op, GROp): - grEvents.append((opIdx, op.mtIteration, - set(op.subtileA), set(op.subtileB))) + for mod in dus.modules: + if isinstance(mod.op, GROp): + grEvents.append((opIdx, mod.op.mtIteration, + set(mod.op.subtileA), set(mod.op.subtileB), + mod.op.lastForMT)) opIdx += 1 + return grEvents - def _parseMTOffset(mt: str) -> Optional[int]: - if mt == "n": - return 0 - if mt.startswith("n+"): - return int(mt[2:]) - return None + @staticmethod + def _parseMTOffset(mt: str) -> Optional[int]: + if mt == "n": + return 0 + if mt.startswith("n+"): + return int(mt[2:]) + return None - def _countInflightSubtileLoads(waitOpIndex, waitMT, waitSubtileA, waitSubtileB): - """Count inflight subtile loads by walking backwards through - the mainloop from the WAIT_GR position until we find the GR - that originally issued the waited subtiles. - - Walk upward from the WAIT_GR. Every GR encountered increments - the inflight count. When we find a GR whose (shifted) MT and - subtile sets match the WAIT target, we stop (without counting it). - - When wrapping from the start of the mainloop to the end - (previous iteration), MT iterations shift down by 1 - (e.g. n+2 becomes n+1, n+1 becomes n). 
- """ - waitOffset = _parseMTOffset(waitMT) - if waitOffset is None: - return 0, 0 - - targetA = set(waitSubtileA) - targetB = set(waitSubtileB) - - # Split grEvents into before-wait and after-wait (for wrap) - before = [(mt, a, b) for (idx, mt, a, b) in grEvents if idx < waitOpIndex] - after = [(mt, a, b) for (idx, mt, a, b) in grEvents if idx >= waitOpIndex] - - totalA = 0 - totalB = 0 - - # Walk backwards through events before the WAIT (no MT shift) - for (grMT, grA, grB) in reversed(before): - grOffset = _parseMTOffset(grMT) - if grOffset is None: - continue - if grOffset == waitOffset and grA == targetA and grB == targetB: - return totalA, totalB - totalA += len(grA) - totalB += len(grB) - - # Wrap: walk backwards from end of mainloop (shift MT by -1) - for (grMT, grA, grB) in reversed(after): - grOffset = _parseMTOffset(grMT) - if grOffset is None: - continue - shiftedOffset = grOffset - 1 - if shiftedOffset == waitOffset and grA == targetA and grB == targetB: - return totalA, totalB - totalA += len(grA) - totalB += len(grB) + def _countInflightSubtileLoads(self, grEvents, sikStart, sikEnd, waitMT, waitSubtileA, waitSubtileB): + """Count GR subtile loads still in flight before the current subIterK. + Excludes any loads within the current subIterK [sikStart, sikEnd). 
+ Returns (inflightA, inflightB, scaleLoadsA, scaleLoadsB).""" + waitOffset = self._parseMTOffset(waitMT) + if waitOffset is None: + return 0, 0, 0, 0 + + targetA, targetB = set(waitSubtileA), set(waitSubtileB) + before = [(mt, a, b, first) for (idx, mt, a, b, first) in grEvents if idx < sikStart] + after = [(mt, a, b, first) for (idx, mt, a, b, first) in grEvents if idx >= sikEnd] + totalA, totalB, scaleA, scaleB = 0, 0, 0, 0 + + for (grMT, grA, grB, lastForMT) in reversed(before): + grOffset = self._parseMTOffset(grMT) + if grOffset is None: + continue + if grOffset == waitOffset and grA == targetA and grB == targetB: + return totalA, totalB, scaleA, scaleB + totalA += len(grA) + totalB += len(grB) + if lastForMT and self.hasScale: + scaleA += self.scaleLoadsPerMT_A + scaleB += self.scaleLoadsPerMT_B + + for (grMT, grA, grB, lastForMT) in reversed(after): + grOffset = self._parseMTOffset(grMT) + if grOffset is None: + continue + if grOffset - 1 == waitOffset and grA == targetA and grB == targetB: + return totalA, totalB, scaleA, scaleB + totalA += len(grA) + totalB += len(grB) + if lastForMT and self.hasScale: + scaleA += self.scaleLoadsPerMT_A + scaleB += self.scaleLoadsPerMT_B + + return totalA, totalB, scaleA, scaleB + + def _buildWaitGROp(self, lrOp, pendingA, pendingB, sikStart, sikEnd, grEvents): + """Determine if a WAIT_GR is needed before this LR. Returns (waitGROp, waitA, waitB).""" + if not lrOp or lrOp.subtileK != 0 or lrOp.subIterK != 0: + return None, set(), set() + + waitA = set(lrOp.lrLoadA.keys()) & pendingA + waitB = set(lrOp.lrLoadB.keys()) & pendingB + if not waitA and not waitB: + return None, set(), set() + + #TODO. fix _countInflightSubtileLoads . 
Not working well in 1x4,4x1 or multi-parition configs + inflightA, inflightB, scaleA, scaleB = self._countInflightSubtileLoads( + grEvents, sikStart, sikEnd, lrOp.mtIteration, sorted(waitA), sorted(waitB)) + waitGROp = WaitGROp( + mtIteration=lrOp.mtIteration, + subtileA=sorted(waitA), subtileB=sorted(waitB), + inflightLoadsA=inflightA, inflightLoadsB=inflightB, + inflightScaleLoadsA=scaleA, inflightScaleLoadsB=scaleB) + pendingA -= waitA + pendingB -= waitB + return waitGROp, waitA, waitB - return totalA, totalB + @staticmethod + def _findMatchingGR(lrOp, priorGRMods, grMods, waitA, waitB): + """Find the GR module that the LR should depend on (matching MT iteration).""" + targetMT = lrOp.mtIteration + for g in reversed(priorGRMods + grMods): + if g.op.mtIteration != targetMT: + continue + gA, gB = set(g.op.subtileA), set(g.op.subtileB) + if (not waitA or waitA.issubset(gA)) and (not waitB or waitB.issubset(gB)): + return g + # Fallback: any GR with matching MT + return next((g for g in reversed(priorGRMods + grMods) + if g.op.mtIteration == targetMT), None) + + @staticmethod + def _annotateGRInc(grMods): + """Append GR_INC to the last GR module for this MT iteration.""" + for grMod in grMods: + if grMod.op.lastForMT: + grMod.after.append(DepEdge(op=GR_INCOp())) - # ── Insert WAIT_LR, WAIT_GR, SyncOp, GR_INC, LR_INC and reorder ── + @staticmethod + def _annotateLRInc(lrMod, lrOp, lastLRmt): + """Prepend LR_INC if the LR's MT iteration changed.""" + if lastLRmt is not None and lrOp.mtIteration != lastLRmt: + lrMod.before.append(DepEdge(op=LR_INCOp())) + + def _annotateLRDependsOnGR(self, lrMod, lrOp, grMods, priorGRMods, waitGROp, waitA, waitB): + """LR depends on GR — GR must complete before LR can read from LDS.""" + grMatch = self._findMatchingGR(lrOp, priorGRMods, grMods, waitA, waitB) + if grMatch is not None: + lrMod.before.append(DepEdge(module=grMatch)) + lrMod.before.append(DepEdge(op=waitGROp)) + lrMod.before.append(DepEdge(op=SyncOp(comment="Barrier: wait 
for GR data"))) + + @staticmethod + def _annotateGRDependsOnLR(lrMod, grMods): + """GR(n+2) depends on LR — LR must complete before GR writes to LDS. Limited due to LDS double-buffering.""" + for grMod in grMods: + grMod.before.append(DepEdge(module=lrMod)) + grMod.before.append(DepEdge(op=WaitLROp())) + grMod.before.append(DepEdge(op=SyncOp(comment="Barrier: all waves done with LR before GR(n+2) writes"))) + + def _annotateDependencies(self, numPartitions: int): + """Annotate each AnnotatedModule with before/after dependency edges.""" + grEvents = self._buildGREvents() pendingA = set() pendingB = set() lastLRmt = None opIdx = 0 + priorGRMods: List[AnnotatedModule] = [] + for pss in self.mainloopSteps: gr = self.partitionGRs[pss.partitionId] + # PendingA/B are subtiles issues not been waited on yet. pendingA |= gr.subtileA pendingB |= gr.subtileB for dus in pss.subIterKSteps: - numOrigOps = len(dus.ops) - # Extract ops by type from pass 1 - mfmaOps = [op for op in dus.ops if isinstance(op, MFMAOp)] - lrOps = [op for op in dus.ops if isinstance(op, LROp)] - grOps = [op for op in dus.ops if isinstance(op, GROp)] - otherOps = [op for op in dus.ops - if not isinstance(op, (MFMAOp, LROp, GROp))] - - lrOp = lrOps[0] if lrOps else None - hasGRn2 = any(g.mtIteration == "n+2" for g in grOps) - - # Determine if WAIT_GR is needed before this LR - waitGROp = None - if lrOp and lrOp.subIterK == 0: - waitA = set(lrOp.lrLoadA.keys()) & pendingA - waitB = set(lrOp.lrLoadB.keys()) & pendingB - if waitA or waitB: - # WAIT_GR position: after all original ops in this subIterK step - waitOpIdx = opIdx + numOrigOps - inflightCountA, inflightCountB = _countInflightSubtileLoads( - waitOpIdx, lrOp.mtIteration, sorted(waitA), sorted(waitB)) - waitGROp = WaitGROp( - mtIteration=lrOp.mtIteration, - subtileA=sorted(waitA), subtileB=sorted(waitB), - inflightLoadsA=inflightCountA, inflightLoadsB=inflightCountB) - pendingA -= waitA - pendingB -= waitB - - # Rebuild ops in correct order - newOps = 
[] - newOps.extend(mfmaOps) - newOps.extend(otherOps) - - if waitGROp: - # subIterK=1 pattern: MFMAs → GR(n+2) → GR_INC? → WAIT_GR → SyncOp → LR_INC? → LR → WAIT_LR - newOps.extend(grOps) - if any(g.lastForMT for g in grOps): - newOps.append(GR_INCOp()) - newOps.append(waitGROp) - newOps.append(SyncOp(comment="Barrier: wait for GR data")) - if lrOp: - if lastLRmt is not None and lrOp.mtIteration != lastLRmt: - newOps.append(LR_INCOp()) - lastLRmt = lrOp.mtIteration - newOps.append(lrOp) - newOps.append(WaitLROp()) - elif hasGRn2 and lrOp: - # subIterK=0 pattern: MFMAs → LR_INC? → LR → WAIT_LR → SyncOp → GR(n+2) → GR_INC? - if lastLRmt is not None and lrOp.mtIteration != lastLRmt: - newOps.append(LR_INCOp()) + lrMods = [m for m in dus.modules if isinstance(m.op, LROp)] + grMods = [m for m in dus.modules if isinstance(m.op, GROp)] + lrMod = lrMods[0] if lrMods else None + lrOp = lrMod.op if lrMod else None + hasGRn2 = any(m.op.mtIteration == "n+2" for m in grMods) + + # build WAIT_GR is the LROp needs subtile that are still pending. + waitGROp, waitA, waitB = self._buildWaitGROp( + lrOp, pendingA, pendingB, opIdx, opIdx + len(dus.modules), grEvents) + + self._annotateGRInc(grMods) + + if waitGROp and lrMod: + self._annotateLRDependsOnGR( + lrMod, lrOp, grMods, priorGRMods, waitGROp, waitA, waitB) + elif hasGRn2 and lrMod: + self._annotateGRDependsOnLR(lrMod, grMods) + + if lrMod: + self._annotateLRInc(lrMod, lrOp, lastLRmt) + lrMod.after.append(DepEdge(op=WaitLROp())) + + if lrOp: lastLRmt = lrOp.mtIteration - newOps.append(lrOp) - newOps.append(WaitLROp()) - newOps.append(SyncOp(comment="Barrier: all waves done with LR before GR(n+2) writes")) - newOps.extend(grOps) - if any(g.lastForMT for g in grOps): - newOps.append(GR_INCOp()) - else: - # No special dependency: MFMAs → LR_INC? → LR → GR → GR_INC? 
→ WAIT_LR - if lrOp: - if lastLRmt is not None and lrOp.mtIteration != lastLRmt: - newOps.append(LR_INCOp()) - lastLRmt = lrOp.mtIteration - newOps.append(lrOp) - newOps.extend(grOps) - if grOps and any(g.lastForMT for g in grOps): - newOps.append(GR_INCOp()) - if lrOp: - newOps.append(WaitLROp()) - - opIdx += numOrigOps - dus.ops = newOps + priorGRMods.extend(grMods) + opIdx += len(dus.modules) + + @staticmethod + def _filterDepEdges(edges: List[DepEdge], remove_types: tuple) -> List[DepEdge]: + """Filter dependency edges, removing ops of specified types.""" + return [e for e in edges if not isinstance(e.op, remove_types)] + # TODO. Re-test with multi-partitions def _buildNGLL(self) -> List[PartitionSchedule]: - """NGLL (Non Global Load Loop): mainloop without GR(n+2) and GR_INC.""" + """NGLL (No Global Load Loop): mainloop without GR(n+2) and GR_INC.""" ngll = [] for pss in self.mainloopSteps: newPss = PartitionSchedule(partitionId=pss.partitionId) for dus in pss.subIterKSteps: - newDus = SubIterKSchedule(subIterK=dus.subIterK, conflict=dus.conflict) - for op in dus.ops: - if isinstance(op, GROp) and op.mtIteration == "n+2": + newDus = SubIterKSchedule(subtileK=dus.subtileK, subIterK=dus.subIterK) + for mod in dus.modules: + if isinstance(mod.op, GROp) and mod.op.mtIteration == "n+2": continue - if isinstance(op, GR_INCOp): - continue - if isinstance(op, WaitGROp): - op = WaitGROp(mtIteration=op.mtIteration, - subtileA=op.subtileA, subtileB=op.subtileB, - inflightLoadsA=0, inflightLoadsB=0) - newDus.ops.append(op) + newBefore = [] + for e in mod.before: + if e.module and isinstance(e.module.op, GROp) and e.module.op.mtIteration == "n+2": + continue + if e.op and isinstance(e.op, WaitGROp): + # TODO. Check counts here. 
+ newBefore.append(DepEdge(op=WaitGROp( + mtIteration=e.op.mtIteration, + subtileA=e.op.subtileA, subtileB=e.op.subtileB, + inflightLoadsA=0, inflightLoadsB=0))) + else: + newBefore.append(e) + newAfter = self._filterDepEdges(mod.after, (GR_INCOp,)) + newDus.modules.append(AnnotatedModule( + op=mod.op, before=newBefore, after=newAfter)) newPss.subIterKSteps.append(newDus) ngll.append(newPss) return ngll + # TODO. Re-test with multi-partitions def _buildNLL(self) -> List[PartitionSchedule]: """NLL (No Load Loop): mainloop without GR, GR_INC, LR_INC, LR(n+1), WaitGR(n+1) and their associated SyncOps. Keeps WaitGR(n) and its SYNC.""" @@ -910,94 +1246,87 @@ def _buildNLL(self) -> List[PartitionSchedule]: for pss in self.mainloopSteps: newPss = PartitionSchedule(partitionId=pss.partitionId) for dus in pss.subIterKSteps: - newDus = SubIterKSchedule(subIterK=dus.subIterK, conflict=dus.conflict) - ops = dus.ops - for i, op in enumerate(ops): - if isinstance(op, (GROp, GR_INCOp, LR_INCOp)): + newDus = SubIterKSchedule(subtileK=dus.subtileK, subIterK=dus.subIterK) + # Track which modules are being removed (for filtering module refs) + removedMods = set() + for mod in dus.modules: + if isinstance(mod.op, GROp): + removedMods.add(id(mod)) + elif isinstance(mod.op, LROp) and mod.op.mtIteration == "n+1": + removedMods.add(id(mod)) + + for mod in dus.modules: + if id(mod) in removedMods: continue - if isinstance(op, LROp) and op.mtIteration == "n+1": - continue - if isinstance(op, WaitGROp): - if op.mtIteration == "n+1": + # Filter deps: remove module refs to removed modules, GR_INC, LR_INC, + # WaitGR(n+1) and its paired SYNC + newBefore = [] + for e in mod.before: + if e.module and id(e.module) in removedMods: continue - op = WaitGROp(mtIteration=op.mtIteration, - subtileA=op.subtileA, subtileB=op.subtileB, - inflightLoadsA=0, inflightLoadsB=0) - if isinstance(op, SyncOp): - # Skip SyncOps associated with removed ops: - # - SyncOp followed by a GROp (barrier before GR writes) - 
# - SyncOp preceded by a WaitGROp(n+1) (barrier after GR wait) - nextOp = ops[i + 1] if i + 1 < len(ops) else None - prevOp = ops[i - 1] if i > 0 else None - if isinstance(nextOp, GROp): + if e.op and isinstance(e.op, (GR_INCOp, LR_INCOp)): continue - if isinstance(prevOp, WaitGROp) and prevOp.mtIteration == "n+1": + if e.op and isinstance(e.op, WaitGROp) and e.op.mtIteration == "n+1": continue - newDus.ops.append(op) - # Remove orphaned WAIT_LR when no LR remains in this subIterK - hasLR = any(isinstance(op, LROp) for op in newDus.ops) + if e.op and isinstance(e.op, SyncOp): + # Skip SYNC if paired with a removed WaitGR(n+1) + idx = mod.before.index(e) + prevEdge = mod.before[idx - 1] if idx > 0 else None + if prevEdge and prevEdge.op and isinstance(prevEdge.op, WaitGROp) and prevEdge.op.mtIteration == "n+1": + continue + if e.op and isinstance(e.op, WaitGROp): + newBefore.append(DepEdge(op=WaitGROp( + mtIteration=e.op.mtIteration, + subtileA=e.op.subtileA, subtileB=e.op.subtileB, + inflightLoadsA=0, inflightLoadsB=0))) + else: + newBefore.append(e) + newAfter = self._filterDepEdges(mod.after, (GR_INCOp,)) + newDus.modules.append(AnnotatedModule( + op=mod.op, before=newBefore, after=newAfter)) + # Remove WaitLROp from after when no LR exists in this subIterK + # (the WaitLROp was for the removed LR(n+1)) + hasLR = any(isinstance(m.op, LROp) for m in newDus.modules) if not hasLR: - newDus.ops = [op for op in newDus.ops if not isinstance(op, WaitLROp)] + for m in newDus.modules: + m.after = self._filterDepEdges(m.after, (WaitLROp,)) newPss.subIterKSteps.append(newDus) nll.append(newPss) return nll - def _checkDuplicatedReads(self): - """Detect if any (subtile, subIterK) pair is loaded more than once.""" - seenA: Dict[AllocKey, int] = {} - seenB: Dict[AllocKey, int] = {} - # Count preloop LR loads - for pss in self.preloopSteps: - for dus in pss.subIterKSteps: - for op in dus.ops: - if isinstance(op, LROp): - for tA in op.lrLoadA: - key = (tA, op.subIterK) - 
seenA[key] = seenA.get(key, 0) + 1 - for tB in op.lrLoadB: - key = (tB, op.subIterK) - seenB[key] = seenB.get(key, 0) + 1 - # Count mainloop LR loads (skip wrap-around which reuses existing allocations) - for pss in self.mainloopSteps: - for dus in pss.subIterKSteps: - for op in dus.ops: - if isinstance(op, LROp) and op.mtIteration != "n+1": - for tA in op.lrLoadA: - key = (tA, op.subIterK) - seenA[key] = seenA.get(key, 0) + 1 - for tB in op.lrLoadB: - key = (tB, op.subIterK) - seenB[key] = seenB.get(key, 0) + 1 - self.hasDuplicatedReads = ( - any(c > 1 for c in seenA.values()) or - any(c > 1 for c in seenB.values()) - ) # ── Debug ──────────────────────────────────────────────── @staticmethod - def _printOp(op: ScheduleOp, indent: str = ""): + def _printOp(op: ScheduleOp, indent: str = "", + showVgpr: bool = False, showSubtiles: bool = False, + scaleSet: int = 0, scaleLRSet: int = 0): if isinstance(op, MFMAOp): - print(f"{indent}MFMAs (MT {op.mtIteration}, subIterK {op.subIterK}):") - print(f"{indent} - {op.subtiles}") - print(f"{indent} - USING A: {op.vgprTileMapA} B: {op.vgprTileMapB}") - if op.scaleMapA or op.scaleMapB: - print(f"{indent} - SCALE A: {op.scaleMapA} B: {op.scaleMapB}") + scaleLabel = f" scaleSet={scaleSet}" if (op.scaleMapA or op.scaleMapB) else "" + print(f"{indent}MFMAs (MT {op.mtIteration}, subtileK {op.subtileK}, subIterK {op.subIterK}):{scaleLabel}") + if showSubtiles: + print(f"{indent} - {op.subtiles}") + if showVgpr: + print(f"{indent} - USING A: {op.vgprTileMapA} B: {op.vgprTileMapB}") elif isinstance(op, GROp): - print(f"{indent}GR (MT {op.mtIteration}): A: {op.subtileA} B: {op.subtileB}") + print(f"{indent}GR (MT {op.mtIteration}, subtileK {op.subtileK}): A: {op.subtileA} B: {op.subtileB}") elif isinstance(op, WaitGROp): - inflight = f" — inflight SubtileLoads A={op.inflightLoadsA} B={op.inflightLoadsB}" if op.inflightLoadsA is not None else "" + if op.inflightLoadsA is not None: + inflight = f" — inflight SubtileLoads 
A={op.inflightLoadsA} B={op.inflightLoadsB} scaleA={op.inflightScaleLoadsA} scaleB={op.inflightScaleLoadsB}" + else: + inflight = "" print(f"{indent}WAIT_GR (MT {op.mtIteration}) A: {op.subtileA} B: {op.subtileB}{inflight}") elif isinstance(op, WaitLROp): print(f"{indent}WAIT_LR") elif isinstance(op, SyncOp): print(f"{indent}SYNC") elif isinstance(op, LROp): - sikLabel = f", subIterK {op.subIterK}" if op.subIterK >= 0 else "" - scaleStr = "" - if op.lrScaleA or op.lrScaleB: - scaleStr = f" scaleA: {op.lrScaleA} scaleB: {op.lrScaleB}" - print(f"{indent}LR (MT {op.mtIteration}{sikLabel}) A: {op.lrLoadA} B: {op.lrLoadB}{scaleStr}") + sikLabel = f", subtileK {op.subtileK}, subIterK {op.subIterK}" if op.subIterK >= 0 else "" + scaleLabel = f" scaleSet={scaleLRSet}" if (op.lrScaleA or op.lrScaleB) else "" + if showVgpr: + print(f"{indent}LR (MT {op.mtIteration}{sikLabel}) A: {op.lrLoadA} B: {op.lrLoadB}{scaleLabel}") + else: + print(f"{indent}LR (MT {op.mtIteration}{sikLabel}){scaleLabel}") elif isinstance(op, SkipOp): print(f"{indent}SKIP_IF_{op.compare}({op.value}, {op.target})") elif isinstance(op, GR_INCOp): @@ -1005,14 +1334,69 @@ def _printOp(op: ScheduleOp, indent: str = ""): elif isinstance(op, LR_INCOp): print(f"{indent}LR_INC") - def printSchedule(self): - print(f"SubtileGridA={self.MTA}, SubtileGridB={self.MTB}") + @staticmethod + def _depEdgeLabel(e: DepEdge) -> str: + if e.module: + op = e.module.op + if isinstance(op, LROp): + return f"LR (MT {op.mtIteration}, subtileK {op.subtileK}, subIterK {op.subIterK})" + elif isinstance(op, GROp): + return f"GR(MT {op.mtIteration}, subtileK {op.subtileK})" + return type(op).__name__ + op = e.op + if isinstance(op, WaitGROp) and op.inflightLoadsA is not None: + return f"WaitGROp(A={op.inflightLoadsA} B={op.inflightLoadsB} SA={op.inflightScaleLoadsA} SB={op.inflightScaleLoadsB})" + return type(op).__name__ + + def _printModules(self, modules: List[AnnotatedModule], indent: str, + showVgpr: bool = False, showDeps: bool = 
False, + showSubtiles: bool = False, + scaleSet: int = 0, scaleLRSet: int = 0): + for mod in modules: + self._printOp(mod.op, indent=indent, showVgpr=showVgpr, + showSubtiles=showSubtiles, + scaleSet=scaleSet, scaleLRSet=scaleLRSet) + if showDeps: + before_str = ", ".join(self._depEdgeLabel(e) for e in mod.before) if mod.before else "none" + after_str = ", ".join(self._depEdgeLabel(e) for e in mod.after) if mod.after else "none" + print(f"{indent} before: [{before_str}] after: [{after_str}]") + + def _printLoopSteps(self, loopSteps: List[PartitionSchedule], indent: str, + showVgpr: bool = False, showDeps: bool = False, + showSubtiles: bool = False, + scaleSet: int = 0, scaleLRSet: int = None): + if scaleLRSet is None: + scaleLRSet = 1 - scaleSet if self.hasScale else scaleSet + for partition in loopSteps: + print(f"{indent}Partition {partition.partitionId}:") + prevSubtileK = None + for dus in partition.subIterKSteps: + if self.hasScale and prevSubtileK is not None \ + and dus.subtileK != prevSubtileK: + scaleSet, scaleLRSet = scaleLRSet, scaleSet + prevSubtileK = dus.subtileK + print(f"{indent} subtileK={dus.subtileK} subIterK={dus.subIterK}:") + self._printModules(dus.modules, indent=f"{indent} ", + showVgpr=showVgpr, showDeps=showDeps, + showSubtiles=showSubtiles, + scaleSet=scaleSet, scaleLRSet=scaleLRSet) + if self.hasScale: + scaleSet, scaleLRSet = scaleLRSet, scaleSet + + def printSchedule(self, showVgpr: bool = False, showDeps: bool = False, + showSubtiles: bool = False): + """Print the schedule. + + Args: + showVgpr: show VGPR tile assignments and scale maps. + showDeps: show before/after dependency edges on each module. + showSubtiles: show MFMA subtile coordinate lists. 
+ """ + print(f"SubtileGridA={self.MTA}x{self.numSubtileK}, SubtileGridB={self.MTB}x{self.numSubtileK}") print(f"Partition grid: {self.numPartitionsA} x {self.numPartitionsB}") print(f"Partition size: {self.config.partitionSizeA} x {self.config.partitionSizeB}") print(f"Prefetch: {self.config.prefetchMode.name}") print(f"Reuse: {self.config.reuseStrategy.name}") - print(f"hasDuplicatedReads: {self.hasDuplicatedReads}") - print(f"needsUnrolling: {self.needsUnrolling}") print(f"totalVGPRTiles: {self.totalVGPRTiles} ({self.totalVGPRTiles * 4} VGPRs)") print(f"totalScaleVGPRTiles: {self.totalScaleVGPRTiles}") print(f"hasScale: {self.hasScale}") @@ -1030,40 +1414,25 @@ def printSchedule(self): print(" " + " ".join(f"{v:2d}" if v is not None else " " for v in row)) print() + opts = dict(showVgpr=showVgpr, showDeps=showDeps, showSubtiles=showSubtiles) + print("PRELOOP:") for pss in self.preloopSteps: for dus in pss.subIterKSteps: - for op in dus.ops: - self._printOp(op, indent=" ") + self._printModules(dus.modules, indent=" ", + showVgpr=showVgpr, showSubtiles=showSubtiles) print() print("MAINLOOP:") - for partition in self.mainloopSteps: - print(f" Partition {partition.partitionId}:") - for dus in partition.subIterKSteps: - print(f" subIterK={dus.subIterK}:") - for op in dus.ops: - self._printOp(op, indent=" ") - if dus.conflict: - print(f" *** CONFLICT: USE/LOAD share VGPRTile IDs {dus.conflict} — needs unrolling ***") + self._printLoopSteps(self.mainloopSteps, indent=" ", **opts) print() print("NGLL (No Global Load Loop):") - for partition in self.ngllSteps: - print(f" Partition {partition.partitionId}:") - for dus in partition.subIterKSteps: - print(f" subIterK={dus.subIterK}:") - for op in dus.ops: - self._printOp(op, indent=" ") + self._printLoopSteps(self.ngllSteps, indent=" ", **opts) print() print("NLL (No Load Loop):") - for partition in self.nllSteps: - print(f" Partition {partition.partitionId}:") - for dus in partition.subIterKSteps: - print(f" 
subIterK={dus.subIterK}:") - for op in dus.ops: - self._printOp(op, indent=" ") + self._printLoopSteps(self.nllSteps, indent=" ", **opts) # Allocate totalVGPRTiles vpgrTile def allocVgprTiles(self, writer): @@ -1120,8 +1489,8 @@ def emitMFMA(self, writer, kernel, op, dtileInfo, scaleSet=0): scaleGroupB = b // 2 scaleAVgpr = scaleTiles[op.scaleMapA[scaleGroupA]] scaleBVgpr = scaleTiles[op.scaleMapB[scaleGroupB]] - sAsel = (a % 2) + 2 * (op.subIterK % 2) - sBsel = (b % 2) + 2 * (op.subIterK % 2) + sAsel = (a % 2) + 2 * op.subIterK + sBsel = (b % 2) + 2 * op.subIterK else: scaleAVgpr = scaleBVgpr = -1 sAsel = sBsel = 0 @@ -1130,7 +1499,7 @@ def emitMFMA(self, writer, kernel, op, dtileInfo, scaleSet=0): writer, kernel, aTile, bTile, dTile, dTile, scaleAVgpr=scaleAVgpr, scaleBVgpr=scaleBVgpr, scaleAsel=sAsel, scaleBsel=sBsel, - comment=f"MFMA C[{a},{b}] += A[{a},subIterK{op.subIterK}] * B[{b},subIterK{op.subIterK}]")) + comment=f"MFMA C[{a},{b}] += A[{a},subtileK={op.subtileK},lK={op.subIterK}] * B[{b},subtileK={op.subtileK},lK={op.subIterK}]")) return module @@ -1141,21 +1510,19 @@ def emitLR(self, writer, kernel, op, scaleSet=0): module = Module() for tA, vgprTileId in op.lrLoadA.items(): dstTile = self.vgprTiles[vgprTileId] - # Using 0 for subtile ID1 for now module.add(emitSingleDsRead( - self.tileInfoA, tA, 0, op.subIterK, dstTile)) + self.tileInfoA, tA, op.subtileK, op.subIterK, dstTile)) for tB, vgprTileId in op.lrLoadB.items(): dstTile = self.vgprTiles[vgprTileId] - # Using 0 for subtile ID1 for now module.add(emitSingleDsRead( - self.tileInfoB, tB, 0, op.subIterK, dstTile)) + self.tileInfoB, tB, op.subtileK, op.subIterK, dstTile)) if op.lrScaleA: - self._emitScaleDsReads(module, writer, 'MXSA', op.lrScaleA, scaleSet=scaleSet) + self._emitScaleDsReads(module, writer, 'MXSA', op.lrScaleA, op.subtileK, scaleSet=scaleSet) if op.lrScaleB: - self._emitScaleDsReads(module, writer, 'MXSB', op.lrScaleB, scaleSet=scaleSet) + self._emitScaleDsReads(module, writer, 'MXSB', 
op.lrScaleB, op.subtileK, scaleSet=scaleSet) return module - def _emitScaleDsReads(self, module, writer, tc, lrScale, scaleSet=0): + def _emitScaleDsReads(self, module, writer, tc, lrScale, subtileK, scaleSet=0): """Emit DSLoadB32 for scale groups using scheduler-managed VGPRs.""" tileInfo = self.scaleTileInfoA if tc == 'MXSA' else self.scaleTileInfoB scaleTiles = self.scaleVgprTiles if scaleSet == 0 else self.scaleVgprTilesAlt @@ -1164,271 +1531,396 @@ def _emitScaleDsReads(self, module, writer, tc, lrScale, scaleSet=0): # covers 1 subtile, and a group is 2 subtiles). groupStride = 2 * tileInfo.subtileSize for scaleGroupIdx, scaleVgprTileId in lrScale.items(): - dsOffset = groupStride * scaleGroupIdx + dsOffset = groupStride * (scaleGroupIdx * self.numSubtileK + subtileK) vdst = scaleTiles[scaleVgprTileId] module.add(DSLoadB32(dst=vgpr(vdst), src=vgpr(tileInfo.sharedVgprLROffset[0]), ds=DSModifiers(offset=dsOffset), - comment="scale%s[group%u]: load 4B from LDS" % (tc, scaleGroupIdx))) + comment="scale%s[group%u,subtileK=%u]: load 4B from LDS" % (tc, scaleGroupIdx, subtileK))) - def emitWaitGR(self, inflightLoadsA, inflightLoadsB, hasScale=False): + def emitWaitGR(self, inflightLoadsA, inflightLoadsB, + inflightScaleLoadsA=0, inflightScaleLoadsB=0): """Emit SWaitCnt for GR (buffer_load) based on inflight GR counts. - WARNING: current algo won't work in all cases. TBD Args: - inflightLoadsA: Number of A GR loads still inflight. - inflightLoadsB: Number of B GR loads still inflight. - hasScale: True when MX scale DTL loads are active (they complete - at lgkmcnt/dscnt, so dscnt=0 is required after the barrier). + inflightLoadsA: Number of A subtile loads still inflight. + inflightLoadsB: Number of B subtile loads still inflight. + inflightScaleLoadsA: Number of scale A loads still inflight. + inflightScaleLoadsB: Number of scale B loads still inflight. 
""" module = Module() grCnt = int(inflightLoadsA / self.tileInfoA.loadRatioGR) + \ - int(inflightLoadsB / self.tileInfoB.loadRatioGR) - # Scale DTL loads (buffer_load lds=True) complete at lgkmcnt (dscnt). - # Wait for both vmcnt (data GR) and lgkmcnt (scale DTL) before barrier. - dscnt = 0 if hasScale else -1 - module.add(SWaitCnt(dscnt=dscnt, vlcnt=grCnt, vscnt=-1, - comment=f"Wait GR: A={inflightLoadsA} B={inflightLoadsB} => vlcnt={grCnt}" + - (" dscnt=0 (scale DTL)" if hasScale else ""))) + int(inflightLoadsB / self.tileInfoB.loadRatioGR) + \ + inflightScaleLoadsA + inflightScaleLoadsB + module.add(SWaitCnt(vlcnt=grCnt, vscnt=-1, + comment=f"Wait GR: A={inflightLoadsA} B={inflightLoadsB} sA={inflightScaleLoadsA} sB={inflightScaleLoadsB} => vlcnt={grCnt}")) return module def emitGR(self, writer, kernel, op): """Emit GR (Global Read) buffer_load instructions for a single GROp.""" module = Module() - # A and B data loads + # Scale DTL loads: emitted on the last GR of an MT so all scale LR from the + # current bank have completed before the DTL overwrites them (scale DTL + # covers all subtileK values in a single load). 
+ if op.lastForMT and self.hasScale: + module.add(globalReadDoScaleSubtile('MXSA', writer, kernel)) + module.add(globalReadDoScaleSubtile('MXSB', writer, kernel)) + # A and B data loads for this GR's subtileK layer for subtileList, tileInfo in [(op.subtileA, self.tileInfoA), (op.subtileB, self.tileInfoB)]: for sId0 in subtileList: - module.add(emitSingleBufferLoad(tileInfo, kernel, sId0, 0)) - # Scale DTL loads: only on the last GR of an MT (scale covers all subtiles) - if op.lastForMT and kernel["ProblemType"].get("MXBlockA", 0) and kernel["ProblemType"].get("MXBlockB", 0): - module.add(globalReadDoScaleSubtile('MXSA', writer, kernel)) - module.add(globalReadDoScaleSubtile('MXSB', writer, kernel)) + module.add(emitSingleBufferLoad(tileInfo, kernel, sId0, op.subtileK)) return module + def _emitOp(self, writer, kernel, op, dtileInfo, scaleSet=0, scaleLRSet=0): + """Emit a single ScheduleOp into a list of instructions.""" + module = Module() + if isinstance(op, GROp): + module.add(self.emitGR(writer, kernel, op)) + elif isinstance(op, GR_INCOp): + module.add(globalReadPtrUpdates('A', writer, kernel)) + module.add(globalReadPtrUpdates('B', writer, kernel)) + module.add(globalReadLDSBufferSwap('A', writer, kernel)) + module.add(globalReadLDSBufferSwap('B', writer, kernel)) + if self.hasScale: + module.add(globalReadLDSBufferSwap('MXSA', writer, kernel)) + module.add(globalReadLDSBufferSwap('MXSB', writer, kernel)) + module.add(globalReadScalePtrUpdates('MXSA', writer, kernel)) + module.add(globalReadScalePtrUpdates('MXSB', writer, kernel)) + elif isinstance(op, MFMAOp): + module.add(self.emitMFMA(writer, kernel, op, dtileInfo, scaleSet=scaleSet)) + elif isinstance(op, WaitGROp): + module.add(self.emitWaitGR(op.inflightLoadsA, op.inflightLoadsB, + op.inflightScaleLoadsA, op.inflightScaleLoadsB)) + elif isinstance(op, WaitLROp): + module.add(SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LR to complete")) + elif isinstance(op, SyncOp): + 
module.add(SBarrier(comment=op.comment)) + elif isinstance(op, LR_INCOp): + module.add(localReadLDSBufferSwap('A', writer, kernel)) + module.add(localReadLDSBufferSwap('B', writer, kernel)) + if self.hasScale: + module.add(localReadLDSBufferSwap('MXSA', writer, kernel)) + module.add(localReadLDSBufferSwap('MXSB', writer, kernel)) + elif isinstance(op, LROp): + module.add(self.emitLR(writer, kernel, op, scaleSet=scaleLRSet)) + elif isinstance(op, SkipOp): + skipLabel = Label(f"SkipTo{op.target}", "") + cmpMap = {"EQ": SCmpEQU32, "LE": SCmpLeU32} + module.add(cmpMap[op.compare]( + src0=sgpr("LoopCounterL"), src1=op.value, + comment=f"LoopCounter {op.compare} {op.value}?")) + module.add(SCBranchSCC1( + labelName=skipLabel.getLabelName(), + comment=f"skip to {op.target}")) + return module.flatitems() + + @staticmethod + def _opType(op): + if isinstance(op, MFMAOp): + return "mfma" + if isinstance(op, LROp): + return "lr" + if isinstance(op, GROp): + return "gr" + if isinstance(op, WaitGROp): + return "wait_gr" + if isinstance(op, WaitLROp): + return "wait_lr" + if isinstance(op, SyncOp): + return "sync" + if isinstance(op, GR_INCOp): + return "gr_inc" + if isinstance(op, LR_INCOp): + return "lr_inc" + if isinstance(op, SkipOp): + return "skip" + return "other" + + def _buildEmittedModules(self, writer, kernel, modules, dtileInfo, scaleSet=0, scaleLRSet=0): + """Build EmittedModules with instructions + before module links.""" + emitted: List[EmittedModule] = [] + modToEmittedId: Dict[int, int] = {} + suppressAfterWaitLRForMod: Set[int] = set() + + def addEmitted(op) -> Optional[int]: + insts = self._emitOp(writer, kernel, op, dtileInfo, + scaleSet=scaleSet, scaleLRSet=scaleLRSet) + if not insts: + return None + emId = len(emitted) + emitted.append(EmittedModule(moduleId=emId, instructions=insts, opType=self._opType(op))) + return emId + + def setBefore(moduleId: int, beforeId: Optional[int]) -> None: + if beforeId is None or beforeId == moduleId: + return + curBefore = 
emitted[moduleId].before + if curBefore is None: + emitted[moduleId].before = beforeId + return + assert curBefore == beforeId, \ + f"EmittedModule {moduleId} has multiple before deps: {curBefore} and {beforeId}" + + # Primary modules first (MFMA/LR/GR) + for mod in modules: + emId = addEmitted(mod.op) + if emId is not None: + modToEmittedId[id(mod)] = emId + + # If another module has before=[module-ref-to-X, WaitLROp, ...], + # suppress standalone X.after WaitLROp emission to avoid duplicates. + for mod in modules: + hasWaitLRInBefore = any(isinstance(e.op, WaitLROp) for e in mod.before) + if not hasWaitLRInBefore: + continue + for e in mod.before: + if e.module is not None: + suppressAfterWaitLRForMod.add(id(e.module)) + + # Dependency-op links for emitted debug/scheduling. + for mod in modules: + curId = modToEmittedId.get(id(mod)) + if curId is None: + continue + + # before ops: chain from module refs / deps, then before points to + # the last non-standalone dep. + prevId: Optional[int] = None + lastDepId: Optional[int] = None + for edge in mod.before: + if edge.module: + prevId = modToEmittedId.get(id(edge.module), prevId) + continue + if edge.op is None: + continue + depId = addEmitted(edge.op) + if depId is None: + continue + if isinstance(edge.op, WaitGROp): + # Keep WAIT_GR standalone (no links), but allow later deps to + # chain from it. + prevId = depId + continue + setBefore(depId, prevId) + prevId = depId + lastDepId = depId + if lastDepId is not None: + setBefore(curId, lastDepId) + elif prevId is not None: + # before had only module refs and/or standalone deps + setBefore(curId, prevId) + + # after ops: append deps as standalone modules and chain them via + # before links so before-only path extraction can follow them. 
+ depIds: List[int] = [] + for edge in mod.after: + if edge.module: + mId = modToEmittedId.get(id(edge.module)) + if mId is not None: + depIds.append(mId) + continue + if edge.op is None: + continue + if isinstance(edge.op, WaitLROp) and id(mod) in suppressAfterWaitLRForMod: + continue + depId = addEmitted(edge.op) + if depId is None: + continue + depIds.append(depId) + prevAfterId = curId + for depId in depIds: + setBefore(depId, prevAfterId) + prevAfterId = depId + + return emitted + def _emitSubIterK(self, writer, kernel, pss, dus, scaleSet=0, scaleLRSet=0): """Emit a single subIterK step into a Module. scaleSet: which scale VGPR set MFMA reads from. - scaleLRSet: which scale VGPR set LR writes to.""" + scaleLRSet: which scale VGPR set LR writes to. + + If modules contain MFMAs, emits via instructionSchedule for + dependency-aware interleaving. Otherwise emits sequentially. + """ dtileInfo = writer.states.d.tileInfo module = Module() - module.addComment0(f"Partition {pss.partitionId}: subIterK={dus.subIterK}") - hasScale = (kernel["ProblemType"].get("MXBlockA", 0) and - kernel["ProblemType"].get("MXBlockB", 0)) - for op in dus.ops: - if isinstance(op, GROp): - module.add(self.emitGR(writer, kernel, op)) - elif isinstance(op, GR_INCOp): - module.add(globalReadPtrUpdates('A', writer, kernel)) - module.add(globalReadPtrUpdates('B', writer, kernel)) - module.add(globalReadLDSBufferSwap('A', writer, kernel)) - module.add(globalReadLDSBufferSwap('B', writer, kernel)) - if hasScale: - module.add(globalReadLDSBufferSwap('MXSA', writer, kernel)) - module.add(globalReadLDSBufferSwap('MXSB', writer, kernel)) - module.add(globalReadScalePtrUpdates('MXSA', writer, kernel)) - module.add(globalReadScalePtrUpdates('MXSB', writer, kernel)) - elif isinstance(op, MFMAOp): - module.add(self.emitMFMA(writer, kernel, op, dtileInfo, scaleSet=scaleSet)) - elif isinstance(op, WaitGROp): - module.add(self.emitWaitGR(op.inflightLoadsA, op.inflightLoadsB, hasScale)) - elif isinstance(op, 
WaitLROp): - module.add(SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LR to complete")) - elif isinstance(op, SyncOp): - module.add(SBarrier(comment=op.comment)) - elif isinstance(op, LR_INCOp): - module.add(localReadLDSBufferSwap('A', writer, kernel)) - module.add(localReadLDSBufferSwap('B', writer, kernel)) - if hasScale: - module.add(localReadLDSBufferSwap('MXSA', writer, kernel)) - module.add(localReadLDSBufferSwap('MXSB', writer, kernel)) - elif isinstance(op, LROp): - module.add(self.emitLR(writer, kernel, op, scaleSet=scaleLRSet)) - elif isinstance(op, SkipOp): - skipLabel = Label(f"SkipTo{op.target}", "") - cmpMap = {"EQ": SCmpEQU32, "LE": SCmpLeU32} - module.add(cmpMap[op.compare]( - src0=sgpr("LoopCounterL"), src1=op.value, - comment=f"LoopCounter {op.compare} {op.value}?")) - module.add(SCBranchSCC1( - labelName=skipLabel.getLabelName(), - comment=f"skip to {op.target}")) + module.addComment0(f"Partition {pss.partitionId}: subtileK={dus.subtileK} subIterK={dus.subIterK}") + + hasMFMA = any(isinstance(m.op, MFMAOp) for m in dus.modules) + if hasMFMA: + emitted = self._buildEmittedModules(writer, kernel, dus.modules, dtileInfo, + scaleSet=scaleSet, scaleLRSet=scaleLRSet) + merged = self.instructionSchedule(emitted) + module.add(merged) + else: + # Special case for preloop (MFMA free) + for m in dus.modules: + for inst in self._emitOp(writer, kernel, m.op, dtileInfo, + scaleSet=scaleSet, scaleLRSet=scaleLRSet): + module.add(inst) return module @staticmethod - def instructionSchedule(module): - """Schedule MFMAs among other instructions within a subIterK module. 
- - Rules (invariants preserved by this pass): - - MFMA instruction order is preserved - - Non-MFMA instruction order is preserved - - Insert 1 MFMA between each LR (ds_read) instruction - - Insert 3 MFMAs between the last LR and the WAIT_LR (SWaitCnt dscnt) - - Insert 1 MFMA between WAIT_LR and SYNC (SBarrier) - - No MFMAs between an m0 update and its buffer_load (they are a pair) - - Remaining MFMAs are spread evenly between buffer_load pairs + def _extractPathsFromBeforeDeps(emittedModules: List['EmittedModule']) -> Tuple[int, List[List[int]]]: + """Extract non-MFMA dependency paths using only EmittedModule.before links. + + Returns: + (mfmaIdx, paths) + - mfmaIdx: index of the MFMA emitted module in emittedModules + - paths: list of non-MFMA module-index paths """ - #return module - items = module.flatitems() - if not items: - return module + idToIdx = {em.moduleId: i for i, em in enumerate(emittedModules)} + n = len(emittedModules) + + mfmaModuleIds = [i for i, em in enumerate(emittedModules) if em.opType == "mfma"] + assert len(mfmaModuleIds) == 1, "_extractPathsFromBeforeDeps expects exactly one MFMA emitted module" + mfmaIdx = mfmaModuleIds[0] + nonMfmaIds = [i for i in range(n) if i != mfmaIdx] + nonMfmaSet = set(nonMfmaIds) + + # Each non-MFMA module has at most one predecessor, and each predecessor + # has at most one child, so paths are simple chains. 
+ pred: List[int] = [-1 for _ in range(n)] + child: List[int] = [-1 for _ in range(n)] + for i in nonMfmaIds: + parent = -1 + b = emittedModules[i].before + if b is not None: + bi = idToIdx.get(b) + if bi is not None and bi != i and bi in nonMfmaSet: + parent = bi + pred[i] = parent + if parent != -1: + assert child[parent] == -1, \ + f"_extractPathsFromBeforeDeps expects unique child per predecessor, got {child[parent]} and {i} for {parent}" + child[parent] = i + + def _findHead(mid: int) -> int: + cur = mid + seen = [False for _ in range(n)] + while pred[cur] != -1 and not seen[cur]: + seen[cur] = True + cur = pred[cur] + return cur + + def _walkFromHead(head: int, used: List[bool]) -> List[int]: + order: List[int] = [] + localSeen = [False for _ in range(n)] + cur = head + while cur != -1 and not used[cur] and not localSeen[cur]: + order.append(cur) + localSeen[cur] = True + cur = child[cur] + return order + + used = [False for _ in range(n)] + paths: List[List[int]] = [] + for mid in nonMfmaIds: + if used[mid]: + continue + head = _findHead(mid) + order = _walkFromHead(head, used) + assert order, f"_extractPathsFromBeforeDeps produced empty path for module {mid}" + for i in order: + used[i] = True + paths.append(order) + + return mfmaIdx, paths - isMFMA = lambda x: isinstance(x, (MFMAInstruction, MXMFMAInstruction)) - mfmas = [x for x in items if isMFMA(x)] - others = [x for x in items if not isMFMA(x)] - - if not mfmas or not others: - return module - - # Group others into slots: each slot is a list of instructions that - # must stay together (e.g. m0 update + buffer_load pair). - # We'll insert MFMAs BETWEEN slots. 
- slots = [] - i = 0 - while i < len(others): - inst = others[i] - # Pair m0 update with its following buffer_load - if i + 1 < len(others) and isinstance(others[i + 1], GlobalReadInstruction): - slots.append(others[i:i+2]) - i += 2 - else: - slots.append([inst]) - i += 1 - - # Classify each slot for MFMA insertion rules - LR_SLOT = 0 # ds_read (LocalReadInstruction) - WAITLR_SLOT = 1 # SWaitCnt with dscnt (WAIT_LR) - SYNC_SLOT = 2 # SBarrier (SYNC) - GR_SLOT = 3 # m0 + buffer_load pair or standalone buffer_load - OTHER_SLOT = 4 # everything else - - def classify(slot): - first = slot[0] - # DSLoadB32 = scale LR (ds_read_b32): with double-buffered scale VGPRs, - # MFMA reads from one set while ds_read writes to another, so interleaving is safe. - if isinstance(first, DSLoadB32): - return LR_SLOT - if isinstance(first, LocalReadInstruction): - return LR_SLOT - if isinstance(first, SWaitCnt): - return WAITLR_SLOT - if isinstance(first, SBarrier): - return SYNC_SLOT - if isinstance(first, GlobalReadInstruction) or \ - (len(slot) > 1 and isinstance(slot[-1], GlobalReadInstruction)): - return GR_SLOT - return OTHER_SLOT - - slotTypes = [classify(s) for s in slots] - - # Build MFMA budget: how many MFMAs to insert AFTER each slot. 
- # (mfmasAfter[i] = number of MFMAs inserted after slots[i]) - numSlots = len(slots) - mfmasAfter = [0] * numSlots - - mi = 0 # next MFMA to assign - - # Pass 1: assign fixed MFMAs per rules - for si in range(numSlots): - if mi >= len(mfmas): - break - st = slotTypes[si] - nextSt = slotTypes[si + 1] if si + 1 < numSlots else None - - if st == LR_SLOT and nextSt == LR_SLOT: - # 1 MFMA between each LR - mfmasAfter[si] = min(1, len(mfmas) - mi) - mi += mfmasAfter[si] - elif st == LR_SLOT and nextSt == WAITLR_SLOT: - # 3 MFMAs between last LR and WAIT_LR - mfmasAfter[si] = min(3, len(mfmas) - mi) - mi += mfmasAfter[si] - elif st == LR_SLOT and nextSt != LR_SLOT and nextSt != WAITLR_SLOT: - # Last LR but no WAIT_LR follows — still insert 1 - mfmasAfter[si] = min(1, len(mfmas) - mi) - mi += mfmasAfter[si] - elif st == WAITLR_SLOT and nextSt == SYNC_SLOT: - # 1 MFMA between WAIT_LR and SYNC - mfmasAfter[si] = min(1, len(mfmas) - mi) - mi += mfmasAfter[si] - - # Pass 2: spread remaining MFMAs evenly between GR slots - grIndices = [si for si in range(numSlots) if slotTypes[si] == GR_SLOT] - remaining = len(mfmas) - mi - if remaining > 0 and grIndices: - base = remaining // len(grIndices) - extra = remaining % len(grIndices) - for gi, si in enumerate(grIndices): - count = base + (1 if gi < extra else 0) - mfmasAfter[si] += count - mi += count - - # Assemble final output - result = Module() - mfmaIdx = 0 - - # Leading MFMAs: any unassigned MFMAs go before the first slot - leadingMfmas = len(mfmas) - mi - for _ in range(leadingMfmas): - result.add(mfmas[mfmaIdx]) - mfmaIdx += 1 - - for si in range(numSlots): - for inst in slots[si]: - result.add(inst) - for _ in range(mfmasAfter[si]): - if mfmaIdx < len(mfmas): - result.add(mfmas[mfmaIdx]) - mfmaIdx += 1 - - # Any remaining MFMAs at the end - while mfmaIdx < len(mfmas): - result.add(mfmas[mfmaIdx]) - mfmaIdx += 1 + @staticmethod + def instructionSchedule(emittedModules: List['EmittedModule']): + """Interleave non-MFMA 
instructions between MFMAs using 2 slots/interval. + + Rules: + - MFMA order is preserved. + - Between two adjacent MFMAs there are 2 placement slots. + - At most one ds_read (LocalReadInstruction) per interval. + - Before dependencies are respected at module order level. + - Minimm distance between ds_read and it waitcnt (hardcoded for now) + - Module-internal instruction order is preserved. + - LR path containing a WAIT_GR is packed from the end backwards. We want WAIT_GR to be done as late as possible. + - GR path is spread as much as possible across remaining valid slots. No backwards here as we want GRs to be done as early as possible. + + TODO : To be tested on multi-partition setup. + """ + if not emittedModules: + return Module() - return result + isMFMA = lambda x: isinstance(x, (MFMAInstruction, MXMFMAInstruction)) + n = len(emittedModules) + + mfmaIdx, pathOrders = SubtileBasedScheduler._extractPathsFromBeforeDeps(emittedModules) + mfmas = [x for x in emittedModules[mfmaIdx].instructions if isMFMA(x)] + + # Single MFMA: no slots to interleave into — emit MFMA then all paths. 
+ if len(mfmas) < 2: + result = Module() + for m in mfmas: + result.add(m) + for order in pathOrders: + for mid in order: + for inst in emittedModules[mid].instructions: + result.add(inst) + return result + + paths = _classifyPaths(pathOrders, emittedModules) + rules = _SchedulingRules(totalSlots=(len(mfmas) - 1) * 2) + placer = _SlotPlacer( + len(mfmas) - 1, n, pathOrders, + validators=[rules.oneDsReadPerInterval, rules.minGapDsReadBeforeWait, rules.minGapDsReadToWait, rules.noM0WithBufferLoad], + adjusters=[rules.spreadBufferLoads], + onPlace=rules.trackPlacement) + + for order, hasWaitGR in paths: + if not order: + continue + pathInsts = _flattenPath(order, emittedModules, reverse=hasWaitGR) + rules.resetPath() + if not hasWaitGR: + rules.setupBufLoadSpreading(placer, pathInsts, order) + placer.placePath(pathInsts, reverse=hasWaitGR) + + scheduled = placer.assemble(mfmas) + + # Post-pass: adjust vmcnt of any SWaitCnt to account for buffer_loads + # that the scheduler placed before it within this subIterK. + bufLoadCount = 0 + for inst in scheduled.items(): + if _isBufferLoad(inst): + bufLoadCount += 1 + elif _isWaitCnt(inst) and inst.vlcnt >= 0: + inst.vlcnt += bufLoadCount + + return scheduled def _emitLoop(self, writer, kernel, label, steps, scaleSet=0, scaleLRSet=None): - """Emit a loop module (mainloop, NGLL, or NLL). - - Emits each subIterK step as a separate module, applies instruction - interleaving, then combines into the final loop module. - All waits (WAIT_LR, WAIT_GR, SyncOp) are explicit schedule ops. + """Emit a loop section (preloop, mainloop, NGLL, or NLL). scaleSet: which scale VGPR set MFMA reads from (starting set for first partition). scaleLRSet: which scale VGPR set LR writes to (defaults to 1-scaleSet if None). Both rotate per partition so each partition's MFMA reads the scales that the previous partition's LR loaded. 
+ Additionally, when numSubtileK > 1, scaleSet/scaleLRSet swap at + subtileK boundaries so each subtileK's MFMAs read the scales that + the previous subtileK's LR loaded into the alternate set. """ if scaleLRSet is None: scaleLRSet = 1 - scaleSet if self.hasScale else scaleSet module = Module(label) module.addComment0(f"{label} start") for pss in steps: + prevSubtileK = None for dus in pss.subIterKSteps: + if self.hasScale and prevSubtileK is not None \ + and dus.subtileK != prevSubtileK: + scaleSet, scaleLRSet = scaleLRSet, scaleSet + prevSubtileK = dus.subtileK subModule = self._emitSubIterK(writer, kernel, pss, dus, scaleSet=scaleSet, scaleLRSet=scaleLRSet) - subModule = self.instructionSchedule(subModule) module.add(subModule) if self.hasScale: scaleSet, scaleLRSet = scaleLRSet, scaleSet + module.addComment0(f"{label} end") return module - def generateCode(self, writer, kernel): - self.allocVgprTiles(writer) - - preloop = self._emitLoop(writer, kernel, "PRELOOP", self.preloopSteps) - mainloop = self._emitLoop(writer, kernel, "MAINLOOP", self.mainloopSteps) - - ngll = Module("NGLL") - ngll.add(Label("SkipToNGLL", "")) - ngll.add(self._emitLoop(writer, kernel, "NGLL", self.ngllSteps)) - - nll = Module("NLL") - nll.add(Label("SkipToNLL", "")) - nll.add(self._emitLoop(writer, kernel, "NLL", self.nllSteps)) - - for label, module in [("PRELOOP", preloop), ("MAINLOOP", mainloop), - ("NGLL", ngll), ("NLL", nll)]: - print(f"\n{label}:") - print(module) diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_bf16.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_bf16.yaml index c499051c6e3..07840a8ec8b 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_bf16.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_bf16.yaml @@ -85,7 +85,7 @@ BenchmarkProblems: - [16, 16, 32, 1, 1, 2, 3, 2, 2] # MT 64x96 (2x3 wavetile) - [16, 16, 32, 1, 1, 2, 9, 4, 1] # MT 
128x144 (very asymmetric N) - [16, 16, 32, 1, 1, 9, 2, 1, 4] # MT 144x128 (very asymmetric M) - - PrefetchGlobalRead: [0] + - PrefetchGlobalRead: [0, 2] - PrefetchLocalRead: [0] - DepthU: [64] # 2*MI_K = 2*32 - ScheduleIterAlg: [3] @@ -142,7 +142,7 @@ BenchmarkProblems: - [16, 16, 32, 1, 1, 8, 8, 2, 2] # MT 256x256 - [16, 16, 32, 1, 1, 6, 4, 2, 2] # MT 192x128 (asymmetric) - [16, 16, 32, 1, 1, 2, 6, 4, 1] # MT 128x96 (4x1 wave group) - - PrefetchGlobalRead: [0] + - PrefetchGlobalRead: [0, 2] - PrefetchLocalRead: [0] - DepthU: [128, 192] # > 2*MI_K; K must be mult of LCM(128,192)=384 - ScheduleIterAlg: [3] @@ -193,7 +193,7 @@ BenchmarkProblems: - [16, 16, 32, 1, 1, 3, 4, 2, 2] # MT 96x128 (odd M wavetile) - [16, 16, 32, 1, 1, 6, 4, 2, 2] # MT 192x128 (asymmetric) - [16, 16, 32, 1, 1, 2, 6, 4, 1] # MT 128x96 (4x1 WG) - - PrefetchGlobalRead: [0] + - PrefetchGlobalRead: [0, 2] - PrefetchLocalRead: [0] - DepthU: [64] # 2*MI_K = 2*32 - ScheduleIterAlg: [3] @@ -246,7 +246,7 @@ BenchmarkProblems: - MatrixInstruction: - [16, 16, 32, 1, 1, 4, 4, 2, 2] # MT 128x128 - [16, 16, 32, 1, 1, 8, 8, 2, 2] # MT 256x256 - - PrefetchGlobalRead: [0] + - PrefetchGlobalRead: [0, 2] - PrefetchLocalRead: [0] - DepthU: [64] - ScheduleIterAlg: [3] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_mxfp4.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_mxfp4.yaml index f7e82632511..0cb14158291 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_mxfp4.yaml +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx950/subtile_mxfp4.yaml @@ -77,7 +77,7 @@ BenchmarkProblems: # 1x4 wave group - [16, 16, 128, 1, 1, 2, 6, 1, 4] # MT 32x384 - [16, 16, 128, 1, 1, 6, 4, 1, 4] # MT 96x256 - - PrefetchGlobalRead: [0] + - PrefetchGlobalRead: [0, 2] - PrefetchLocalRead: [0] - DepthU: [256] # 2*MI_K = 2*128 - ScheduleIterAlg: [3] @@ -149,8 +149,8 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 8, 2, 2] # MT 
256x256 - [16, 16, 128, 1, 1, 6, 4, 2, 2] # MT 192x128 (asymmetric) - [16, 16, 128, 1, 1, 2, 6, 4, 1] # MT 128x96 (4x1 wave group) - - PrefetchGlobalRead: [0] - - PrefetchLocalRead: [0] + - PrefetchGlobalRead: [0, 2] + - PrefetchLocalRead: [1] - DepthU: [512] # 4*MI_K; K must be mult of 512 - ScheduleIterAlg: [3] - DirectToLds: [1] @@ -210,7 +210,7 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 8, 2, 2] # MT 256x256 - [16, 16, 128, 1, 1, 6, 4, 2, 2] # MT 192x128 (asymmetric) - [16, 16, 128, 1, 1, 2, 6, 4, 1] # MT 128x96 (4x1 WG) - - PrefetchGlobalRead: [0] + - PrefetchGlobalRead: [0, 2] - PrefetchLocalRead: [0] - DepthU: [256] # 2*MI_K = 2*128 - ScheduleIterAlg: [3] @@ -271,7 +271,7 @@ BenchmarkProblems: - MatrixInstruction: - [16, 16, 128, 1, 1, 4, 4, 2, 2] # MT 128x128 - [16, 16, 128, 1, 1, 8, 8, 2, 2] # MT 256x256 - - PrefetchGlobalRead: [0] + - PrefetchGlobalRead: [0, 2] - PrefetchLocalRead: [0] - DepthU: [256] - ScheduleIterAlg: [3] @@ -329,7 +329,7 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 4, 6, 2, 2] # MT 128x192 - [16, 16, 128, 1, 1, 6, 8, 2, 2] # MT 192x256 - [16, 16, 128, 1, 1, 8, 6, 2, 2] # MT 256x192 - - PrefetchGlobalRead: [2] + - PrefetchGlobalRead: [0, 2] - PrefetchLocalRead: [1] - DepthU: [256] - ScheduleIterAlg: [3] @@ -400,7 +400,7 @@ BenchmarkProblems: - [16, 16, 128, 1, 1, 8, 2, 1, 4] # MT 128x128 - [16, 16, 128, 1, 1, 4, 4, 1, 4] # MT 64x256 - [16, 16, 128, 1, 1, 8, 4, 1, 4] # MT 128x256 - - PrefetchGlobalRead: [0] + - PrefetchGlobalRead: [0, 2] - PrefetchLocalRead: [0] - DepthU: [256] - ScheduleIterAlg: [3] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/unit/test_SubtileBasedScheduler.py b/projects/hipblaslt/tensilelite/Tensile/Tests/unit/test_SubtileBasedScheduler.py index cc2329c0e73..aa3d89460ac 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tests/unit/test_SubtileBasedScheduler.py +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/unit/test_SubtileBasedScheduler.py @@ -3,7 +3,11 @@ from types import SimpleNamespace from 
unittest.mock import MagicMock from Tensile.Components.SubtileBasedKernel import TileInfo -from Tensile.Components.SubtileBasedScheduler import SubtileBasedScheduler, SchedulerConfig, PrefetchMode, VGPRTileReUseStrategy, SubgroupOrdering +from Tensile.Components.SubtileBasedScheduler import ( + SubtileBasedScheduler, SchedulerConfig, PrefetchMode, + MFMAOp, GROp, LROp, WaitGROp, WaitLROp, SyncOp, GR_INCOp, LR_INCOp, +) +from rocisa.code import Module, Label from rocisa import rocIsa from rocisa.register import RegisterPool from rocisa.enum import RegisterType @@ -21,30 +25,46 @@ def _mock_dtype(num_bytes=2): mock.numBytes.return_value = num_bytes return mock -def create_kernel(MT0=256, MT1=256): - dtype = _mock_dtype(2) +def create_kernel(MT0=256, MT1=256, fp4=False, depthU=None): + mxblock = 32 if fp4 else 0 + bpe = 0.5 if fp4 else 2 + matrixInstK = 128 if fp4 else 32 + if depthU is None: + depthU = 256 if fp4 else 64 + dtype = _mock_dtype(bpe) problemType = { "DataTypeA": dtype, "DataTypeB": dtype, "ComputeDataType": _mock_dtype(4), } - return { - "DepthU": 64, - "_DepthUA": 64, - "_DepthUB": 64, + if fp4: + problemType["MXBlockA"] = mxblock + problemType["MXBlockB"] = mxblock + kernel = { + "DepthU": depthU, + "_DepthUA": depthU, + "_DepthUB": depthU, "MacroTileA": MT0, "MacroTileB": MT1, "MacroTile0": MT0, "MacroTile1": MT1, "MatrixInstM": 16, "MatrixInstN": 16, - "MatrixInstK": 32, + "MatrixInstK": matrixInstK, "MIWaveGroup": [2, 2], "WavefrontSize": 64, "SourceSwap": False, "MIArchVgpr": False, + "NonTemporalA": 0, + "NonTemporalB": 0, + "NonTemporalMXSA": 0, + "NonTemporalMXSB": 0, "ProblemType": problemType, } + if fp4: + kernel["_DepthUMXSA"] = depthU // mxblock + kernel["_DepthUMXSB"] = depthU // mxblock + return kernel def create_mock_writer(kernel): writer = SimpleNamespace() @@ -60,12 +80,18 @@ def create_mock_writer(kernel): writer.states.d = SimpleNamespace(tileInfo=dTileInfo) return writer -def create_writer_with_tiles(kernel, tiA, tiB): +def 
create_writer_with_tiles(kernel, tiA, tiB, scaleTiA=None, scaleTiB=None): writer = create_mock_writer(kernel) writer.states.a = SimpleNamespace(tileInfo=tiA) writer.states.b = SimpleNamespace(tileInfo=tiB) + writer.states.mxsa = SimpleNamespace(tileInfo=scaleTiA) if scaleTiA else SimpleNamespace() + writer.states.mxsb = SimpleNamespace(tileInfo=scaleTiB) if scaleTiB else SimpleNamespace() tiA.allocOffsetRegisters(writer, kernel) tiB.allocOffsetRegisters(writer, kernel) + if scaleTiA: + scaleTiA.allocOffsetRegisters(writer, kernel) + if scaleTiB: + scaleTiB.allocOffsetRegisters(writer, kernel) return writer @@ -74,13 +100,12 @@ def test_PGR2_64_64_1x1(): kernel = create_kernel(MT0,MT1) tiA = TileInfo('A', kernel) tiB = TileInfo('B', kernel) + # 2x2 partition grid lsgA = tiA.localSubtileGrid[0] lsgB = tiB.localSubtileGrid[0] - cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH, - VGPRTileReUseStrategy.ACROSS_SUBGROUP, - SubgroupOrdering.COLUMN_MAJOR) + cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH) s = SubtileBasedScheduler(tiA, tiB, cfg) assert len(s.preloopSteps) > 0 @@ -92,17 +117,15 @@ def test_PGR2_64_64_1x1(): buf = io.StringIO() with contextlib.redirect_stdout(buf): - s.printSchedule() + s.printSchedule(showVgpr=True, showDeps=True, showSubtiles=True) actual = buf.getvalue() expected = """\ -SubtileGridA=2, SubtileGridB=2 +SubtileGridA=2x1, SubtileGridB=2x1 Partition grid: 1 x 1 Partition size: 2 x 2 Prefetch: HALF_PREFETCH -Reuse: ACROSS_SUBGROUP -hasDuplicatedReads: False -needsUnrolling: False +Reuse: ACROSS_PARTITIONS totalVGPRTiles: 8 (32 VGPRs) totalScaleVGPRTiles: 0 hasScale: False @@ -111,74 +134,87 @@ def test_PGR2_64_64_1x1(): 0 PRELOOP: - GR (MT 0): A: [0, 1] B: [0, 1] + GR (MT 0, subtileK 0): A: [0, 1] B: [0, 1] GR_INC - WAIT_GR (MT 0) A: [0, 1] B: [0, 1] — inflight SubtileLoads A=0 B=0 + WAIT_GR (MT 0) A: [0, 1] B: [0, 1] — inflight SubtileLoads A=0 B=0 scaleA=0 scaleB=0 SYNC - LR (MT 0, subIterK 0) A: {0: 0, 1: 1} B: {0: 
2, 1: 3} + LR (MT 0, subtileK 0, subIterK 0) A: {0: 0, 1: 1} B: {0: 2, 1: 3} WAIT_LR SKIP_IF_LE(1, NLL) - GR (MT 1): A: [0, 1] B: [0, 1] + GR (MT 1, subtileK 0): A: [0, 1] B: [0, 1] GR_INC SKIP_IF_LE(2, NGLL) MAINLOOP: Partition 0: - subIterK=0: - MFMAs (MT n, subIterK 0): + subtileK=0 subIterK=0: + MFMAs (MT n, subtileK 0, subIterK 0): - [(0, 0), (0, 1), (1, 0), (1, 1)] - USING A: {0: 0, 1: 1} B: {0: 2, 1: 3} - LR (MT n, subIterK 1) A: {0: 4, 1: 5} B: {0: 6, 1: 7} - WAIT_LR - SYNC - GR (MT n+2): A: [0] B: [0] - subIterK=1: - MFMAs (MT n, subIterK 1): + before: [none] after: [none] + LR (MT n, subtileK 0, subIterK 1) A: {0: 4, 1: 5} B: {0: 6, 1: 7} + before: [none] after: [WaitLROp] + GR (MT n+2, subtileK 0): A: [0] B: [0] + before: [LR (MT n, subtileK 0, subIterK 1), WaitLROp, SyncOp] after: [none] + subtileK=0 subIterK=1: + MFMAs (MT n, subtileK 0, subIterK 1): - [(0, 0), (0, 1), (1, 0), (1, 1)] - USING A: {0: 4, 1: 5} B: {0: 6, 1: 7} - GR (MT n+2): A: [1] B: [1] - GR_INC - WAIT_GR (MT n+1) A: [0, 1] B: [0, 1] — inflight SubtileLoads A=2 B=2 - SYNC - LR_INC - LR (MT n+1, subIterK 0) A: {0: 0, 1: 1} B: {0: 2, 1: 3} - WAIT_LR - -NGLL (No Global Load Loop): - Partition 0: - subIterK=0: - MFMAs (MT n, subIterK 0): - - [(0, 0), (0, 1), (1, 0), (1, 1)] - - USING A: {0: 0, 1: 1} B: {0: 2, 1: 3} - LR (MT n, subIterK 1) A: {0: 4, 1: 5} B: {0: 6, 1: 7} - WAIT_LR - SYNC - subIterK=1: - MFMAs (MT n, subIterK 1): - - [(0, 0), (0, 1), (1, 0), (1, 1)] - - USING A: {0: 4, 1: 5} B: {0: 6, 1: 7} - WAIT_GR (MT n+1) A: [0, 1] B: [0, 1] — inflight SubtileLoads A=0 B=0 - SYNC - LR_INC - LR (MT n+1, subIterK 0) A: {0: 0, 1: 1} B: {0: 2, 1: 3} - WAIT_LR + before: [none] after: [none] + LR (MT n+1, subtileK 0, subIterK 0) A: {0: 0, 1: 1} B: {0: 2, 1: 3} + before: [WaitGROp(A=1 B=1 SA=0 SB=0), SyncOp, LR_INCOp] after: [WaitLROp] + GR (MT n+2, subtileK 0): A: [1] B: [1] + before: [none] after: [GR_INCOp] +""" + + assert expected in actual + + +def test_PGR2_64_64_1x1_fp4(): + kernel = 
create_kernel(64, 64, fp4=True) + tiA = TileInfo('A', kernel) + tiB = TileInfo('B', kernel) + scaleTiA = TileInfo('MXSA', kernel) + scaleTiB = TileInfo('MXSB', kernel) + lsgA = tiA.localSubtileGrid[0] + lsgB = tiB.localSubtileGrid[0] + + cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH) + s = SubtileBasedScheduler(tiA, tiB, cfg, + scaleTileInfoA=scaleTiA, scaleTileInfoB=scaleTiB) -NLL (No Load Loop): + # HALF_PREFETCH: only 2 flatK alive at a time → 2 × 2 subtiles × 2 (A+B) = 8 + assert s.totalVGPRTiles == 8 + assert s.totalScaleVGPRTiles == 2 + assert s.hasScale + + buf = io.StringIO() + with contextlib.redirect_stdout(buf): + s.printSchedule(showVgpr=True, showDeps=True) + actual = buf.getvalue() + + expected = """\ +MAINLOOP: Partition 0: - subIterK=0: - MFMAs (MT n, subIterK 0): - - [(0, 0), (0, 1), (1, 0), (1, 1)] + subtileK=0 subIterK=0: + MFMAs (MT n, subtileK 0, subIterK 0): scaleSet=0 - USING A: {0: 0, 1: 1} B: {0: 2, 1: 3} - LR (MT n, subIterK 1) A: {0: 4, 1: 5} B: {0: 6, 1: 7} - WAIT_LR - subIterK=1: - MFMAs (MT n, subIterK 1): - - [(0, 0), (0, 1), (1, 0), (1, 1)] + before: [none] after: [none] + LR (MT n, subtileK 0, subIterK 1) A: {0: 4, 1: 5} B: {0: 6, 1: 7} + before: [none] after: [WaitLROp] + GR (MT n+2, subtileK 0): A: [0] B: [0] + before: [LR (MT n, subtileK 0, subIterK 1), WaitLROp, SyncOp] after: [none] + subtileK=0 subIterK=1: + MFMAs (MT n, subtileK 0, subIterK 1): scaleSet=0 - USING A: {0: 4, 1: 5} B: {0: 6, 1: 7} + before: [none] after: [none] + LR (MT n+1, subtileK 0, subIterK 0) A: {0: 0, 1: 1} B: {0: 2, 1: 3} scaleSet=1 + before: [WaitGROp(A=1 B=1 SA=0 SB=0), SyncOp, LR_INCOp] after: [WaitLROp] + GR (MT n+2, subtileK 0): A: [1] B: [1] + before: [none] after: [GR_INCOp] """ - assert actual == expected - + assert expected in actual def test_PGR2_64_64_2x2(): @@ -190,9 +226,7 @@ def test_PGR2_64_64_2x2(): lsgA = tiA.localSubtileGrid[0]//2 lsgB = tiB.localSubtileGrid[0]//2 - cfg = SchedulerConfig(lsgA, lsgB, 
PrefetchMode.HALF_PREFETCH, - VGPRTileReUseStrategy.ACROSS_SUBGROUP, - SubgroupOrdering.COLUMN_MAJOR) + cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH) s = SubtileBasedScheduler(tiA, tiB, cfg) assert len(s.preloopSteps) > 0 @@ -204,17 +238,15 @@ def test_PGR2_64_64_2x2(): buf = io.StringIO() with contextlib.redirect_stdout(buf): - s.printSchedule() + s.printSchedule(showVgpr=True, showDeps=True, showSubtiles=True) actual = buf.getvalue() expected = """\ -SubtileGridA=2, SubtileGridB=2 +SubtileGridA=2x1, SubtileGridB=2x1 Partition grid: 2 x 2 Partition size: 1 x 1 Prefetch: HALF_PREFETCH -Reuse: ACROSS_SUBGROUP -hasDuplicatedReads: False -needsUnrolling: False +Reuse: ACROSS_PARTITIONS totalVGPRTiles: 8 (32 VGPRs) totalScaleVGPRTiles: 0 hasScale: False @@ -224,231 +256,611 @@ def test_PGR2_64_64_2x2(): 1 3 PRELOOP: - GR (MT 0): A: [0] B: [0] - GR (MT 0): A: [1] B: [] - GR (MT 0): A: [] B: [1] + GR (MT 0, subtileK 0): A: [0] B: [0] + GR (MT 0, subtileK 0): A: [1] B: [] + GR (MT 0, subtileK 0): A: [] B: [1] GR_INC - WAIT_GR (MT 0) A: [0, 1] B: [0, 1] — inflight SubtileLoads A=0 B=0 + WAIT_GR (MT 0) A: [0, 1] B: [0, 1] — inflight SubtileLoads A=0 B=0 scaleA=0 scaleB=0 SYNC - LR (MT 0, subIterK 0) A: {0: 0} B: {0: 1} + LR (MT 0, subtileK 0, subIterK 0) A: {0: 0} B: {0: 1} WAIT_LR SKIP_IF_LE(1, NLL) - GR (MT 1): A: [0] B: [0] + GR (MT 1, subtileK 0): A: [0] B: [0] SKIP_IF_LE(2, NGLL) MAINLOOP: Partition 0: - subIterK=0: - MFMAs (MT n, subIterK 0): + subtileK=0 subIterK=0: + MFMAs (MT n, subtileK 0, subIterK 0): - [(0, 0)] - USING A: {0: 0} B: {0: 1} - LR (MT n, subIterK 1) A: {0: 2} B: {0: 3} - GR (MT n+1): A: [1] B: [] - WAIT_LR - subIterK=1: - MFMAs (MT n, subIterK 1): + before: [none] after: [none] + LR (MT n, subtileK 0, subIterK 1) A: {0: 2} B: {0: 3} + before: [none] after: [WaitLROp] + GR (MT n+1, subtileK 0): A: [1] B: [] + before: [none] after: [none] + subtileK=0 subIterK=1: + MFMAs (MT n, subtileK 0, subIterK 1): - [(0, 0)] - USING A: {0: 2} B: 
{0: 3} - WAIT_GR (MT n) A: [1] B: [] — inflight SubtileLoads A=2 B=2 - SYNC - LR (MT n, subIterK 0) A: {1: 4} B: {} - WAIT_LR + before: [none] after: [none] + LR (MT n, subtileK 0, subIterK 0) A: {1: 4} B: {} + before: [WaitGROp(A=2 B=2 SA=0 SB=0), SyncOp] after: [WaitLROp] Partition 1: - subIterK=0: - MFMAs (MT n, subIterK 0): + subtileK=0 subIterK=0: + MFMAs (MT n, subtileK 0, subIterK 0): - [(1, 0)] - USING A: {1: 4} B: {0: 1} - LR (MT n, subIterK 1) A: {1: 5} B: {} - GR (MT n+1): A: [] B: [1] - GR_INC - WAIT_LR - subIterK=1: - MFMAs (MT n, subIterK 1): + before: [none] after: [none] + LR (MT n, subtileK 0, subIterK 1) A: {1: 5} B: {} + before: [none] after: [WaitLROp] + GR (MT n+1, subtileK 0): A: [] B: [1] + before: [none] after: [GR_INCOp] + subtileK=0 subIterK=1: + MFMAs (MT n, subtileK 0, subIterK 1): - [(1, 0)] - USING A: {1: 5} B: {0: 3} - WAIT_GR (MT n) A: [] B: [1] — inflight SubtileLoads A=2 B=2 - SYNC - LR (MT n, subIterK 0) A: {} B: {1: 6} - WAIT_LR + before: [none] after: [none] + LR (MT n, subtileK 0, subIterK 0) A: {} B: {1: 6} + before: [WaitGROp(A=2 B=2 SA=0 SB=0), SyncOp] after: [WaitLROp] Partition 2: - subIterK=0: - MFMAs (MT n, subIterK 0): + subtileK=0 subIterK=0: + MFMAs (MT n, subtileK 0, subIterK 0): - [(0, 1)] - USING A: {0: 0} B: {1: 6} - LR (MT n, subIterK 1) A: {} B: {1: 7} - WAIT_LR - subIterK=1: - MFMAs (MT n, subIterK 1): + before: [none] after: [none] + LR (MT n, subtileK 0, subIterK 1) A: {} B: {1: 7} + before: [none] after: [WaitLROp] + subtileK=0 subIterK=1: + MFMAs (MT n, subtileK 0, subIterK 1): - [(0, 1)] - USING A: {0: 2} B: {1: 7} - LR (MT n, subIterK 0) A: {} B: {} - WAIT_LR + before: [none] after: [none] + LR (MT n, subtileK 0, subIterK 0) A: {} B: {} + before: [none] after: [WaitLROp] Partition 3: - subIterK=0: - MFMAs (MT n, subIterK 0): + subtileK=0 subIterK=0: + MFMAs (MT n, subtileK 0, subIterK 0): - [(1, 1)] - USING A: {1: 4} B: {1: 6} - LR (MT n, subIterK 1) A: {} B: {} - WAIT_LR - SYNC - GR (MT n+2): A: [0] B: 
[0] - subIterK=1: - MFMAs (MT n, subIterK 1): + before: [none] after: [none] + LR (MT n, subtileK 0, subIterK 1) A: {} B: {} + before: [none] after: [WaitLROp] + GR (MT n+2, subtileK 0): A: [0] B: [0] + before: [LR (MT n, subtileK 0, subIterK 1), WaitLROp, SyncOp] after: [none] + subtileK=0 subIterK=1: + MFMAs (MT n, subtileK 0, subIterK 1): - [(1, 1)] - USING A: {1: 5} B: {1: 7} - WAIT_GR (MT n+1) A: [0] B: [0] — inflight SubtileLoads A=2 B=2 - SYNC - LR_INC - LR (MT n+1, subIterK 0) A: {0: 0} B: {0: 1} - WAIT_LR + before: [none] after: [none] + LR (MT n+1, subtileK 0, subIterK 0) A: {0: 0} B: {0: 1} + before: [GR(MT n+1, subtileK 0), WaitGROp(A=2 B=2 SA=0 SB=0), SyncOp, LR_INCOp] after: [WaitLROp] +""" -NGLL (No Global Load Loop): - Partition 0: - subIterK=0: - MFMAs (MT n, subIterK 0): - - [(0, 0)] - - USING A: {0: 0} B: {0: 1} - LR (MT n, subIterK 1) A: {0: 2} B: {0: 3} - GR (MT n+1): A: [1] B: [] - WAIT_LR - subIterK=1: - MFMAs (MT n, subIterK 1): - - [(0, 0)] - - USING A: {0: 2} B: {0: 3} - WAIT_GR (MT n) A: [1] B: [] — inflight SubtileLoads A=0 B=0 - SYNC - LR (MT n, subIterK 0) A: {1: 4} B: {} - WAIT_LR - Partition 1: - subIterK=0: - MFMAs (MT n, subIterK 0): - - [(1, 0)] - - USING A: {1: 4} B: {0: 1} - LR (MT n, subIterK 1) A: {1: 5} B: {} - GR (MT n+1): A: [] B: [1] - WAIT_LR - subIterK=1: - MFMAs (MT n, subIterK 1): - - [(1, 0)] - - USING A: {1: 5} B: {0: 3} - WAIT_GR (MT n) A: [] B: [1] — inflight SubtileLoads A=0 B=0 - SYNC - LR (MT n, subIterK 0) A: {} B: {1: 6} - WAIT_LR - Partition 2: - subIterK=0: - MFMAs (MT n, subIterK 0): - - [(0, 1)] - - USING A: {0: 0} B: {1: 6} - LR (MT n, subIterK 1) A: {} B: {1: 7} - WAIT_LR - subIterK=1: - MFMAs (MT n, subIterK 1): - - [(0, 1)] - - USING A: {0: 2} B: {1: 7} - LR (MT n, subIterK 0) A: {} B: {} - WAIT_LR - Partition 3: - subIterK=0: - MFMAs (MT n, subIterK 0): - - [(1, 1)] - - USING A: {1: 4} B: {1: 6} - LR (MT n, subIterK 1) A: {} B: {} - WAIT_LR - SYNC - subIterK=1: - MFMAs (MT n, subIterK 1): - - [(1, 1)] 
- - USING A: {1: 5} B: {1: 7} - WAIT_GR (MT n+1) A: [0] B: [0] — inflight SubtileLoads A=0 B=0 - SYNC - LR_INC - LR (MT n+1, subIterK 0) A: {0: 0} B: {0: 1} - WAIT_LR + assert expected in actual + + +def _mod_sig(mod): + """Compact signature of an AnnotatedModule: (opType, before_ops, after_ops).""" + def _op_name(e): + if e.module: + op = e.module.op + if isinstance(op, LROp): + return f"LR(MT {op.mtIteration})" + if isinstance(op, GROp): + return f"GR(MT {op.mtIteration})" + return type(op).__name__ + return type(e.op).__name__ + op = mod.op + if isinstance(op, MFMAOp): + name = f"MFMA(sik {op.subIterK})" + elif isinstance(op, LROp): + name = f"LR(MT {op.mtIteration}, sik {op.subIterK})" + elif isinstance(op, GROp): + name = f"GR(MT {op.mtIteration})" + else: + name = type(op).__name__ + before = [_op_name(e) for e in mod.before] + after = [_op_name(e) for e in mod.after] + return (name, before, after) + + +def _step_sig(steps): + """Extract compact structural signature from loop steps.""" + result = [] + for pss in steps: + for dus in pss.subIterKSteps: + mods = [_mod_sig(m) for m in dus.modules] + result.append((pss.partitionId, dus.subIterK, mods)) + return result + + +def _create_1x1_scheduler(): + MT0 = MT1 = 64 + kernel = create_kernel(MT0, MT1) + tiA = TileInfo('A', kernel) + tiB = TileInfo('B', kernel) + lsgA = tiA.localSubtileGrid[0] + lsgB = tiB.localSubtileGrid[0] + cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH) + return SubtileBasedScheduler(tiA, tiB, cfg) + + +def test_PGR2_64_64_1x1_ngll(): + """NGLL removes GR(n+2) and GR_INC; orphaned deps from GR(n+2) are dropped.""" + s = _create_1x1_scheduler() + mainloop_sig = _step_sig(s.mainloopSteps) + ngll_sig = _step_sig(s.ngllSteps) + + # Same number of partitions and subIterK steps + assert len(ngll_sig) == len(mainloop_sig) + + for (pid, sik, mods) in ngll_sig: + # No GR(n+2) modules + assert not any("GR(MT n+2)" in name for name, _, _ in mods), \ + f"NGLL partition {pid} sik {sik} 
should not have GR(n+2)" + # No GR_INCOp in any after deps + for name, before, after in mods: + assert "GR_INCOp" not in after, \ + f"NGLL partition {pid} sik {sik} {name} should not have GR_INCOp in after" + # No SyncOp orphaned from removed GR(n+2) + for name, before, after in mods: + assert "SyncOp" not in after, \ + f"NGLL partition {pid} sik {sik} {name} should not have orphaned SyncOp in after" + + +def test_PGR2_64_64_1x1_nll(): + """NLL removes all GR, LR(n+1), GR_INC, LR_INC, WaitGR(n+1) and paired SyncOps.""" + s = _create_1x1_scheduler() + nll_sig = _step_sig(s.nllSteps) + + for (pid, sik, mods) in nll_sig: + for name, before, after in mods: + # No GR modules + assert not name.startswith("GR("), \ + f"NLL partition {pid} sik {sik} should not have {name}" + # No LR(n+1) modules + assert "MT n+1" not in name, \ + f"NLL partition {pid} sik {sik} should not have {name}" + # No GR_INC or LR_INC in deps + assert "GR_INCOp" not in before and "GR_INCOp" not in after, \ + f"NLL {name} should not have GR_INCOp" + assert "LR_INCOp" not in before, \ + f"NLL {name} should not have LR_INCOp in before" + + # subIterK=0: should still have MFMA + LR(MT n) + _, _, mods_sik0 = nll_sig[0] + op_names = [name for name, _, _ in mods_sik0] + assert any("MFMA" in n for n in op_names) + assert any("LR(MT n" in n for n in op_names) + + # subIterK=1: should have MFMA only (LR(n+1) removed, no GR) + _, _, mods_sik1 = nll_sig[1] + op_names = [name for name, _, _ in mods_sik1] + assert any("MFMA" in n for n in op_names) + assert not any(n.startswith("LR") for n in op_names), \ + f"NLL sik=1 should not have LR, got {op_names}" + + +def test_PGR2_64_64_1x1_emitted_modules_links(verbose=False): + MT0 = MT1 = 64 + kernel = create_kernel(MT0, MT1) + tiA = TileInfo('A', kernel) + tiB = TileInfo('B', kernel) + lsgA = tiA.localSubtileGrid[0] + lsgB = tiB.localSubtileGrid[0] + + cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH) + s = SubtileBasedScheduler(tiA, tiB, cfg) + writer = 
create_writer_with_tiles(kernel, tiA, tiB) + + s.allocVgprTiles(writer) + try: + dtileInfo = writer.states.d.tileInfo + pss = s.mainloopSteps[0] + emitted0 = s._buildEmittedModules(writer, kernel, pss.subIterKSteps[0].modules, dtileInfo) + emitted1 = s._buildEmittedModules(writer, kernel, pss.subIterKSteps[1].modules, dtileInfo) + finally: + s.deallocVgprTiles(writer) + + sig = lambda ems: [(em.moduleId, em.opType, len(em.instructions), em.before) for em in ems] + + assert sig(emitted0) == [ + (0, "mfma", 4, None), + (1, "lr", 4, None), + (2, "gr", 4, 4), + (3, "wait_lr", 1, 1), + (4, "sync", 1, 3), + ] + assert sig(emitted1) == [ + (0, "mfma", 4, None), + (1, "lr", 4, 5), + (2, "gr", 4, None), + (3, "wait_gr", 1, None), + (4, "sync", 1, 3), + (5, "lr_inc", 6, 4), + (6, "wait_lr", 1, 1), + (7, "gr_inc", 8, 2), + ] + + if verbose: + for label, emitted in [("subIterK=0", emitted0), ("subIterK=1", emitted1)]: + print(f" {label}:") + for em in emitted: + beforeStr = str(em.before) if em.before is not None else "-" + print(f" id={em.moduleId} {em.opType}: {len(em.instructions)} insts " + f"before=[{beforeStr}]") + + +def test_PGR2_256_256_1x1_extract_paths_from_before_deps(): + MT0 = MT1 = 256 + kernel = create_kernel(MT0, MT1) + tiA = TileInfo('A', kernel) + tiB = TileInfo('B', kernel) + lsgA = tiA.localSubtileGrid[0] + lsgB = tiB.localSubtileGrid[0] + + cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH) + s = SubtileBasedScheduler(tiA, tiB, cfg) + writer = create_writer_with_tiles(kernel, tiA, tiB) + + s.allocVgprTiles(writer) + try: + dtileInfo = writer.states.d.tileInfo + pss = s.mainloopSteps[0] + dus0 = pss.subIterKSteps[0] + dus1 = pss.subIterKSteps[1] + + emitted0 = s._buildEmittedModules(writer, kernel, dus0.modules, dtileInfo) + emitted1 = s._buildEmittedModules(writer, kernel, dus1.modules, dtileInfo) + finally: + s.deallocVgprTiles(writer) + + sig0 = [(em.moduleId, em.opType, len(em.instructions), em.before) for em in emitted0] + sig1 = 
[(em.moduleId, em.opType, len(em.instructions), em.before) for em in emitted1] + + assert sig0 == [ + (0, "mfma", 64, None), + (1, "lr", 16, None), + (2, "gr", 16, 4), + (3, "wait_lr", 1, 1), + (4, "sync", 1, 3), + ] + assert sig1 == [ + (0, "mfma", 64, None), + (1, "lr", 16, 5), + (2, "gr", 16, None), + (3, "wait_gr", 1, None), + (4, "sync", 1, 3), + (5, "lr_inc", 6, 4), + (6, "wait_lr", 1, 1), + (7, "gr_inc", 8, 2), + ] + + mfmaIdx0, pathOrders0 = SubtileBasedScheduler._extractPathsFromBeforeDeps(emitted0) + mfmaIdx1, pathOrders1 = SubtileBasedScheduler._extractPathsFromBeforeDeps(emitted1) + + assert mfmaIdx0 == 0 + assert pathOrders0 == [[1, 3, 4, 2]] + assert mfmaIdx1 == 0 + assert pathOrders1 == [[3, 4, 5, 1, 6], [2, 7]] + + +def _classify_inst(inst): + """Classify an instruction into a single-char type tag.""" + from rocisa.instruction import GlobalReadInstruction, LocalReadInstruction, MFMAInstruction + from Tensile.Components.SubtileBasedKernel import MXMFMAInstruction + if isinstance(inst, (MFMAInstruction, MXMFMAInstruction)): + return 'M' + if isinstance(inst, LocalReadInstruction): + return 'L' + if isinstance(inst, GlobalReadInstruction): + return 'G' + return 'S' + + +def _get_scheduled_instructions(scheduler, writer, kernel, subIterK): + """Build emitted modules for a subIterK and return the flat instruction list.""" + dtileInfo = writer.states.d.tileInfo + pss = scheduler.mainloopSteps[0] + dus = pss.subIterKSteps[subIterK] + emitted = scheduler._buildEmittedModules(writer, kernel, dus.modules, dtileInfo) + scheduled = SubtileBasedScheduler.instructionSchedule(emitted) + return scheduled.flatitems() + + +def _get_scheduled_sequence(scheduler, writer, kernel, subIterK, scaleTiA=None, scaleTiB=None): + """Build emitted modules for a subIterK and return the instruction-scheduled type sequence.""" + return ''.join(_classify_inst(i) for i in _get_scheduled_instructions(scheduler, writer, kernel, subIterK)) + + """Compute scheduling quality metrics from 
a type-tagged sequence string. + + Returns (exposed, spacings) where: + exposed - number of instructions beyond 2 per MFMA slot (0 = ideal) + spacings - list of MFMA-gap distances between consecutive buffer_loads + """ + # Split into per-MFMA-slot buckets + slots = [] + current = [] + for ch in seq: + if ch == 'M': + slots.append(current) + current = [] + else: + current.append(ch) + slots.append(current) # after last MFMA + + exposed = sum(max(0, len(s) - 2) for s in slots) + + # Buffer load spacing: distance in MFMA count between consecutive G's + mfma_idx = 0 + gr_mfma_positions = [] + for ch in seq: + if ch == 'M': + mfma_idx += 1 + elif ch == 'G': + gr_mfma_positions.append(mfma_idx) + spacings = [gr_mfma_positions[i + 1] - gr_mfma_positions[i] + for i in range(len(gr_mfma_positions) - 1)] + return exposed, spacings + +# help non-reg refactoring. To be replaced with a more relaxed test. +def test_PGR2_256_256_fp4_instruction_schedule_exact(): + """Exact regression test for the mainloop instruction schedule (fp4 256x256).""" + kernel = create_kernel(256, 256, fp4=True) + tiA = TileInfo('A', kernel) + tiB = TileInfo('B', kernel) + scaleTiA = TileInfo('MXSA', kernel) + scaleTiB = TileInfo('MXSB', kernel) + lsgA = tiA.localSubtileGrid[0] + lsgB = tiB.localSubtileGrid[0] -NLL (No Load Loop): + cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH) + s = SubtileBasedScheduler(tiA, tiB, cfg, + scaleTileInfoA=scaleTiA, scaleTileInfoB=scaleTiB) + writer = create_writer_with_tiles(kernel, tiA, tiB, + scaleTiA=scaleTiA, scaleTiB=scaleTiB) + s.allocVgprTiles(writer) + try: + seq0 = _get_scheduled_sequence(s, writer, kernel, 0) + seq1 = _get_scheduled_sequence(s, writer, kernel, 1) + finally: + s.deallocVgprTiles(writer) + + # M=MFMA, L=LocalRead, G=GlobalRead(buffer_load), S=scalar ALU/wait/sync + expected_sik0 = \ + "MLMLMLMLMLMLMLMLMLMLMLMLMLMLMLMLMMMMSSMSMGMSMMMMGMSMMMMGMSMMMMGMSMMMMGM" \ + "SMMMMGMSMMMMGMSMMMMGMMMMMMM" + expected_sik1 = \ + 
"MSSMGSMSMMMMGMSMMMMGMSMMMMGMSMMMMGMSMMMMGMSMMSMSSMSGSMSSSMSSMSSMSLMGLMS" \ + "LMLMLMLMGLMSLMLMLMLMGLSMSLSMSLSMSLSMSLSMSLSMSLSMSLSMSLSMSLMLMLMLMMMMSM" + + assert seq0 == expected_sik0, f"subIterK=0 mismatch:\n got: {seq0}\n exp: {expected_sik0}" + assert seq1 == expected_sik1, f"subIterK=1 mismatch:\n got: {seq1}\n exp: {expected_sik1}" + + +def test_PGR2_128_128_DU512_fp4_schedule(): + """Exact schedule test for 128x128 DU512 fp4.""" + kernel = create_kernel(128, 128, fp4=True, depthU=512) + tiA = TileInfo('A', kernel) + tiB = TileInfo('B', kernel) + scaleTiA = TileInfo('MXSA', kernel) + scaleTiB = TileInfo('MXSB', kernel) + lsgA = tiA.localSubtileGrid[0] + lsgB = tiB.localSubtileGrid[0] + + cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH) + s = SubtileBasedScheduler(tiA, tiB, cfg, + scaleTileInfoA=scaleTiA, scaleTileInfoB=scaleTiB) + + assert s.totalVGPRTiles == 32 + assert s.totalScaleVGPRTiles == 4 + assert s.hasScale + + buf = io.StringIO() + with contextlib.redirect_stdout(buf): + s.printSchedule(showVgpr=True, showDeps=True, showSubtiles=True) + actual = buf.getvalue() + + expected = """\ +SubtileGridA=4x2, SubtileGridB=4x2 +Partition grid: 1 x 1 +Partition size: 4 x 4 +Prefetch: HALF_PREFETCH +Reuse: ACROSS_PARTITIONS +totalVGPRTiles: 32 (128 VGPRs) +totalScaleVGPRTiles: 4 +hasScale: True + +Ordering grid (COLUMN_MAJOR): + 0 + +PRELOOP: + GR (MT 0, subtileK 0): A: [0, 1, 2, 3] B: [0, 1, 2, 3] + GR (MT 0, subtileK 1): A: [0, 1, 2, 3] B: [0, 1, 2, 3] + GR_INC + WAIT_GR (MT 0) A: [0, 1, 2, 3] B: [0, 1, 2, 3] — inflight SubtileLoads A=0 B=0 scaleA=0 scaleB=0 + SYNC + LR (MT 0, subtileK 0, subIterK 0) A: {0: 0, 1: 1, 2: 2, 3: 3} B: {0: 4, 1: 5, 2: 6, 3: 7} scaleSet=0 + WAIT_LR + SKIP_IF_LE(1, NLLEarly) + GR (MT 1, subtileK 0): A: [0, 1, 2, 3] B: [0, 1, 2, 3] + GR (MT 1, subtileK 1): A: [0, 1, 2, 3] B: [0, 1, 2, 3] + GR_INC + SKIP_IF_LE(2, NGLL) + +MAINLOOP: Partition 0: - subIterK=0: - MFMAs (MT n, subIterK 0): - - [(0, 0)] - - USING A: {0: 0} B: 
{0: 1} - LR (MT n, subIterK 1) A: {0: 2} B: {0: 3} - WAIT_LR - subIterK=1: - MFMAs (MT n, subIterK 1): - - [(0, 0)] - - USING A: {0: 2} B: {0: 3} - WAIT_GR (MT n) A: [1] B: [] — inflight SubtileLoads A=0 B=0 - SYNC - LR (MT n, subIterK 0) A: {1: 4} B: {} - WAIT_LR - Partition 1: - subIterK=0: - MFMAs (MT n, subIterK 0): - - [(1, 0)] - - USING A: {1: 4} B: {0: 1} - LR (MT n, subIterK 1) A: {1: 5} B: {} - WAIT_LR - subIterK=1: - MFMAs (MT n, subIterK 1): - - [(1, 0)] - - USING A: {1: 5} B: {0: 3} - WAIT_GR (MT n) A: [] B: [1] — inflight SubtileLoads A=0 B=0 - SYNC - LR (MT n, subIterK 0) A: {} B: {1: 6} - WAIT_LR - Partition 2: - subIterK=0: - MFMAs (MT n, subIterK 0): - - [(0, 1)] - - USING A: {0: 0} B: {1: 6} - LR (MT n, subIterK 1) A: {} B: {1: 7} - WAIT_LR - subIterK=1: - MFMAs (MT n, subIterK 1): - - [(0, 1)] - - USING A: {0: 2} B: {1: 7} - LR (MT n, subIterK 0) A: {} B: {} - WAIT_LR - Partition 3: - subIterK=0: - MFMAs (MT n, subIterK 0): - - [(1, 1)] - - USING A: {1: 4} B: {1: 6} - LR (MT n, subIterK 1) A: {} B: {} - WAIT_LR - subIterK=1: - MFMAs (MT n, subIterK 1): - - [(1, 1)] - - USING A: {1: 5} B: {1: 7} + subtileK=0 subIterK=0: + MFMAs (MT n, subtileK 0, subIterK 0): scaleSet=0 + - [(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3), (3, 0), (3, 1), (3, 2), (3, 3)] + - USING A: {0: 0, 1: 1, 2: 2, 3: 3} B: {0: 4, 1: 5, 2: 6, 3: 7} + before: [none] after: [none] + LR (MT n, subtileK 0, subIterK 1) A: {0: 8, 1: 9, 2: 10, 3: 11} B: {0: 12, 1: 13, 2: 14, 3: 15} + before: [none] after: [WaitLROp] + GR (MT n+2, subtileK 0): A: [0, 1] B: [0, 1] + before: [LR (MT n, subtileK 0, subIterK 1), WaitLROp, SyncOp] after: [none] + subtileK=0 subIterK=1: + MFMAs (MT n, subtileK 0, subIterK 1): scaleSet=0 + - [(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3), (3, 0), (3, 1), (3, 2), (3, 3)] + - USING A: {0: 8, 1: 9, 2: 10, 3: 11} B: {0: 12, 1: 13, 2: 14, 3: 15} + before: [none] after: 
[none] + LR (MT n, subtileK 1, subIterK 0) A: {0: 16, 1: 17, 2: 18, 3: 19} B: {0: 20, 1: 21, 2: 22, 3: 23} scaleSet=1 + before: [none] after: [WaitLROp] + GR (MT n+2, subtileK 0): A: [2, 3] B: [2, 3] + before: [LR (MT n, subtileK 1, subIterK 0), WaitLROp, SyncOp] after: [none] + subtileK=1 subIterK=0: + MFMAs (MT n, subtileK 1, subIterK 0): scaleSet=1 + - [(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3), (3, 0), (3, 1), (3, 2), (3, 3)] + - USING A: {0: 16, 1: 17, 2: 18, 3: 19} B: {0: 20, 1: 21, 2: 22, 3: 23} + before: [none] after: [none] + LR (MT n, subtileK 1, subIterK 1) A: {0: 24, 1: 25, 2: 26, 3: 27} B: {0: 28, 1: 29, 2: 30, 3: 31} + before: [none] after: [WaitLROp] + GR (MT n+2, subtileK 1): A: [0, 1] B: [0, 1] + before: [LR (MT n, subtileK 1, subIterK 1), WaitLROp, SyncOp] after: [none] + subtileK=1 subIterK=1: + MFMAs (MT n, subtileK 1, subIterK 1): scaleSet=1 + - [(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3), (3, 0), (3, 1), (3, 2), (3, 3)] + - USING A: {0: 24, 1: 25, 2: 26, 3: 27} B: {0: 28, 1: 29, 2: 30, 3: 31} + before: [none] after: [none] + LR (MT n+1, subtileK 0, subIterK 0) A: {0: 0, 1: 1, 2: 2, 3: 3} B: {0: 4, 1: 5, 2: 6, 3: 7} scaleSet=0 + before: [WaitGROp(A=6 B=6 SA=0 SB=0), SyncOp, LR_INCOp] after: [WaitLROp] + GR (MT n+2, subtileK 1): A: [2, 3] B: [2, 3] + before: [none] after: [GR_INCOp] """ - assert actual == expected + assert expected in actual + + +def test_PGR2_128_128_DU512_fp4_instruction_schedule_exact(): + """Exact regression test for the mainloop instruction schedule (fp4 128x128 DU512).""" + kernel = create_kernel(128, 128, fp4=True, depthU=512) + tiA = TileInfo('A', kernel) + tiB = TileInfo('B', kernel) + scaleTiA = TileInfo('MXSA', kernel) + scaleTiB = TileInfo('MXSB', kernel) + lsgA = tiA.localSubtileGrid[0] + lsgB = tiB.localSubtileGrid[0] + + cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH) + s = 
SubtileBasedScheduler(tiA, tiB, cfg, + scaleTileInfoA=scaleTiA, scaleTileInfoB=scaleTiB) + writer = create_writer_with_tiles(kernel, tiA, tiB, + scaleTiA=scaleTiA, scaleTiB=scaleTiB) + s.allocVgprTiles(writer) + try: + seq0 = _get_scheduled_sequence(s, writer, kernel, 0) + seq1 = _get_scheduled_sequence(s, writer, kernel, 1) + seq2 = _get_scheduled_sequence(s, writer, kernel, 2) + seq3 = _get_scheduled_sequence(s, writer, kernel, 3) + finally: + s.deallocVgprTiles(writer) + + # M=MFMA, L=LocalRead, G=GlobalRead(buffer_load), S=scalar ALU/wait/sync + expected_sik0 = "MLMLMLMLMLMLMLMLMMMMSSMSMGMSGSGSGM" + expected_sik1 = "MLMLMLMLMLMLMLMLMLMLMLMLSMSSMGMSGSGSGM" + expected_sik2 = "MLMLMLMLMLMLMLMLMMMMSSMSMGMSGSGSGM" + expected_sik3 = "MSSSSSSSSSSSSLLSMSLMGLSMSLMGLMSLMGLMSLMGLMSLMGLMSMGSMSSMSSSSSSSSSSSSSSSSM" + + assert seq0 == expected_sik0, f"subIterK=0 mismatch:\n got: {seq0}\n exp: {expected_sik0}" + assert seq1 == expected_sik1, f"subIterK=1 mismatch:\n got: {seq1}\n exp: {expected_sik1}" + assert seq2 == expected_sik2, f"subIterK=2 mismatch:\n got: {seq2}\n exp: {expected_sik2}" + assert seq3 == expected_sik3, f"subIterK=3 mismatch:\n got: {seq3}\n exp: {expected_sik3}" + + +def test_PGR2_256_256_fp4_vmcnt(): + """Verify vmcnt values in SWaitCnt instructions for 256x256 fp4 mainloop. + + The scheduler's post-pass sets vlcnt = initial_inflight + buffer_loads_before_wait. + We verify that for each SWaitCnt with vlcnt >= 0, the value equals the number + of buffer_load instructions that appear before it in the scheduled sequence. + The initial inflight count (from prior subIterK GRs) is baked into the + pre-adjustment vlcnt by emitWaitGR, so the final vlcnt must be at least + the number of buffer_loads placed before the wait in this subIterK. 
+ """ + from rocisa.instruction import SWaitCnt, GlobalReadInstruction + + kernel = create_kernel(256, 256, fp4=True) + tiA = TileInfo('A', kernel) + tiB = TileInfo('B', kernel) + scaleTiA = TileInfo('MXSA', kernel) + scaleTiB = TileInfo('MXSB', kernel) + lsgA = tiA.localSubtileGrid[0] + lsgB = tiB.localSubtileGrid[0] + + cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH) + s = SubtileBasedScheduler(tiA, tiB, cfg, + scaleTileInfoA=scaleTiA, scaleTileInfoB=scaleTiB) + writer = create_writer_with_tiles(kernel, tiA, tiB, + scaleTiA=scaleTiA, scaleTiB=scaleTiB) + s.allocVgprTiles(writer) + try: + for sik in range(len(s.mainloopSteps[0].subIterKSteps)): + insts = _get_scheduled_instructions(s, writer, kernel, sik) + + # Walk instructions: count buffer_loads before each SWaitCnt + buf_loads_before = 0 + for inst in insts: + if isinstance(inst, GlobalReadInstruction): + buf_loads_before += 1 + elif isinstance(inst, SWaitCnt) and inst.vlcnt >= 0: + # vlcnt must be >= buffer_loads placed before this wait + # (the difference is the initial inflight from prior GRs) + assert inst.vlcnt >= buf_loads_before, \ + f"subIterK={sik}: SWaitCnt vlcnt={inst.vlcnt} < " \ + f"buf_loads_before={buf_loads_before}" + finally: + s.deallocVgprTiles(writer) if __name__ == "__main__": - MT0=MT1=256 - kernel = create_kernel(MT0,MT1) + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--fp4", action="store_true", help="Enable FP4 path with MX scales") + parser.add_argument("--du", type=int, default=None, help="DepthU override (default: 256 for fp4, 64 otherwise)") + args = parser.parse_args() + + MT0=MT1=64 + kernel = create_kernel(MT0, MT1, fp4=args.fp4, depthU=args.du) tiA = TileInfo('A', kernel) tiB = TileInfo('B', kernel) - # 2x2 partition grid - lsgA = tiA.localSubtileGrid[0]//2 - lsgB = tiB.localSubtileGrid[0]//2 + print("TileInfo A:", tiA) - cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH, - VGPRTileReUseStrategy.ACROSS_SUBGROUP, - 
SubgroupOrdering.COLUMN_MAJOR) - s = SubtileBasedScheduler(tiA, tiB, cfg) + scaleTiA = TileInfo('MXSA', kernel) if args.fp4 else None + scaleTiB = TileInfo('MXSB', kernel) if args.fp4 else None + + lsgA = tiA.localSubtileGrid[0] + lsgB = tiB.localSubtileGrid[0] + + cfg = SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH) + s = SubtileBasedScheduler(tiA, tiB, cfg, + scaleTileInfoA=scaleTiA, scaleTileInfoB=scaleTiB) assert len(s.preloopSteps) > 0 assert len(s.mainloopSteps) > 0 assert len(s.ngllSteps) > 0 assert len(s.nllSteps) > 0 - writer = create_writer_with_tiles(kernel, tiA, tiB) + writer = create_writer_with_tiles(kernel, tiA, tiB, + scaleTiA=scaleTiA, scaleTiB=scaleTiB) + print("=== DEFAULT ===") s.printSchedule() - s.generateCode(writer, kernel) + print("\n=== VGPR + DEPS ===") + s.printSchedule(showVgpr=False, showDeps=True, showSubtiles=False) + # s.printSchedule() + + s.allocVgprTiles(writer) + + preloop = s._emitLoop(writer, kernel, "PRELOOP", s.preloopSteps) + mainloop = s._emitLoop(writer, kernel, "MAINLOOP", s.mainloopSteps) + ngll = Module("NGLL") + ngll.add(Label("SkipToNGLL", "")) + ngll.add(s._emitLoop(writer, kernel, "NGLL", s.ngllSteps)) + nll = Module("NLL") + nll.add(Label("SkipToNLL", "")) + nll.add(s._emitLoop(writer, kernel, "NLL", s.nllSteps)) + print(mainloop) # kernel = create_kernel() # tiA = TileInfo('A', kernel) # tiB = TileInfo('B', kernel) @@ -456,10 +868,10 @@ def test_PGR2_64_64_2x2(): # lsgB = tiB.localSubtileGrid[0] # configs = [ - # # (f"lsg {lsgA}x{lsgB}, group {lsgA}x{lsgB}, HALF_PREFETCH, ACROSS_SUBGROUP, COLUMN_MAJOR", - # # SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH, VGPRTileReUseStrategy.ACROSS_SUBGROUP, SubgroupOrdering.COLUMN_MAJOR)), - # (f"lsg {lsgA}x{lsgB}, group {lsgA//2}x{lsgB//2}, HALF_PREFETCH, ACROSS_SUBGROUP, COLUMN_MAJOR", - # SchedulerConfig(lsgA//2, lsgB//2, PrefetchMode.HALF_PREFETCH, VGPRTileReUseStrategy.ACROSS_SUBGROUP, SubgroupOrdering.COLUMN_MAJOR)), + # # (f"lsg {lsgA}x{lsgB}, group 
{lsgA}x{lsgB}, HALF_PREFETCH, ACROSS_PARTITIONS, COLUMN_MAJOR", + # # SchedulerConfig(lsgA, lsgB, PrefetchMode.HALF_PREFETCH, VGPRTileReUseStrategy.ACROSS_PARTITIONS, PartitionOrdering.COLUMN_MAJOR)), + # (f"lsg {lsgA}x{lsgB}, group {lsgA//2}x{lsgB//2}, HALF_PREFETCH, ACROSS_PARTITIONS, COLUMN_MAJOR", + # SchedulerConfig(lsgA//2, lsgB//2, PrefetchMode.HALF_PREFETCH, VGPRTileReUseStrategy.ACROSS_PARTITIONS, PartitionOrdering.COLUMN_MAJOR)), # ] # for name, cfg in configs: @@ -468,4 +880,4 @@ def test_PGR2_64_64_2x2(): # s.printSchedule() # writer = create_writer_with_tiles(kernel, tiA, tiB) # # s.generateCode(writer, kernel) - # # print() \ No newline at end of file + # # print()