diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml index 90ae18c9b5d..26c71b0b4c3 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs.yaml @@ -18090,7 +18090,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 + BaseName: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 BufferLoad: true BufferStore: true CUCount: null @@ -18098,7 +18098,7 @@ ClusterLocalRead: 1 CodeObjectVersion: '4' ConvertAfterDS: false - CustomKernelName: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 + CustomKernelName: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 DebugStreamK: 0 DepthU: 64 DirectToLds: true @@ -18133,7 +18133,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 + KernelNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -18236,7 +18236,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 80 - SolutionNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 + SolutionNameMin: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 SourceSwap: false StaggerU: 0 StaggerUMapping: 0 diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_HHS_BH_BiasSH_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_HHS_BH_BiasSH_HAS_SAV_UserArgs.yaml index 51d02696156..9d75690b863 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_HHS_BH_BiasSH_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_HHS_BH_BiasSH_HAS_SAV_UserArgs.yaml @@ -18090,7 +18090,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 + BaseName: Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 BufferLoad: true BufferStore: true CUCount: null @@ -18098,7 +18098,7 @@ ClusterLocalRead: 1 CodeObjectVersion: '4' ConvertAfterDS: false - CustomKernelName: Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 + CustomKernelName: Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 DebugStreamK: 0 DepthU: 64 DirectToLds: true @@ -18133,7 +18133,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 + KernelNameMin: Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -18236,7 +18236,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 80 - SolutionNameMin: Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 + SolutionNameMin: Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 SourceSwap: false StaggerU: 0 StaggerUMapping: 0 diff --git a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s similarity index 99% rename from projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s rename to projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s index 1002ce43d9a..2b1223f0062 100644 --- a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s +++ b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s @@ -4,13 +4,13 @@ /******************************************/ .amdgcn_target "amdgcn-amd-amdhsa--gfx950" .text -.protected Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 -.globl Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 +.protected Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 +.globl Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 .p2align 8 -.type Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950,@function +.type Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950,@function .section .rodata,#alloc .p2align 6 -.amdhsa_kernel Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 +.amdhsa_kernel Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_accum_offset 256 // accvgpr offset .amdhsa_next_free_vgpr 512 // vgprs @@ -96,8 +96,8 @@ amdhsa.version: - 1 - 1 amdhsa.kernels: - - .name: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 - .symbol: 'Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.kd' + - .name: Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 + .symbol: 'Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.kd' .language: OpenCL C .language_version: - 2 @@ -338,7 +338,7 @@ amdhsa.kernels: .wavefront_size: 64 ... .end_amdgpu_metadata -Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950: +Custom_Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950: label_ASM_Start: /// Main body of the asm kernel .macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req v_mul_hi_u32 v[\vgprDstIdx+1], \dividend, \magicNumber diff --git a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s similarity index 99% rename from projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s rename to projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s index e9a803b5b89..ad6850d022b 100644 --- a/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s +++ b/projects/hipblaslt/tensilelite/Tensile/CustomKernels/Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.s @@ -4,13 +4,13 @@ /******************************************/ .amdgcn_target "amdgcn-amd-amdhsa--gfx950" .text -.protected Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 -.globl Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 +.protected Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 +.globl Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 .p2align 8 -.type Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950,@function +.type Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950,@function .section .rodata,#alloc .p2align 6 -.amdhsa_kernel Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 +.amdhsa_kernel Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_accum_offset 256 // accvgpr offset .amdhsa_next_free_vgpr 512 // vgprs @@ -96,8 +96,8 @@ amdhsa.version: - 1 - 1 amdhsa.kernels: - - .name: Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 - .symbol: 'Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.kd' + - .name: Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950 + .symbol: 'Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950.kd' .language: OpenCL C .language_version: - 2 @@ -338,7 +338,7 @@ amdhsa.kernels: .wavefront_size: 64 ... .end_amdgpu_metadata -Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950: +Custom_Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x64_MI16x16x1_shortname0_gfx950: label_ASM_Start: /// Main body of the asm kernel .macro V_MAGIC_DIV vgprDstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req v_mul_hi_u32 v[\vgprDstIdx+1], \dividend, \magicNumber