From 05178d4a3659b5bcf610f6ddf2feaace5fcb1528 Mon Sep 17 00:00:00 2001 From: Harkirat Gill Date: Tue, 10 Mar 2026 10:49:46 -0400 Subject: [PATCH 1/2] Initial support for gfx90c --- projects/rocblas/library/src/handle.cpp | 4 ++ .../rocblas/library/src/include/handle.hpp | 1 + projects/rocblas/library/src/tensile_host.cpp | 4 ++ shared/tensile/Tensile/AsmCaps.py | 44 +++++++++++++++++++ shared/tensile/Tensile/Common.py | 4 +- .../cmake/TensileSupportedArchitectures.cmake | 1 + .../Source/lib/include/Tensile/AMDGPU.hpp | 7 +++ .../include/Tensile/PlaceholderLibrary.hpp | 3 ++ .../Tensile/Serialization/Predicates.hpp | 1 + .../cmake/TensileSupportedArchitectures.cmake | 1 + 10 files changed, 68 insertions(+), 2 deletions(-) diff --git a/projects/rocblas/library/src/handle.cpp b/projects/rocblas/library/src/handle.cpp index 45b3d857662..bb354b889a7 100644 --- a/projects/rocblas/library/src/handle.cpp +++ b/projects/rocblas/library/src/handle.cpp @@ -386,6 +386,10 @@ Processor _rocblas_handle::getActiveArch() { return Processor::gfx900; } + else if(deviceString.find("gfx90c") != std::string::npos) + { + return Processor::gfx90c; + } else if(deviceString.find("gfx906") != std::string::npos) { return Processor::gfx906; diff --git a/projects/rocblas/library/src/include/handle.hpp b/projects/rocblas/library/src/include/handle.hpp index 501deeccc3d..ef65bc40c39 100644 --- a/projects/rocblas/library/src/include/handle.hpp +++ b/projects/rocblas/library/src/include/handle.hpp @@ -79,6 +79,7 @@ enum class Processor : int // only including supported types gfx803 = 803, gfx900 = 900, + gfx90c = 912, gfx906 = 906, gfx908 = 908, gfx90a = 910, diff --git a/projects/rocblas/library/src/tensile_host.cpp b/projects/rocblas/library/src/tensile_host.cpp index e02abe86e92..929a03ee15f 100644 --- a/projects/rocblas/library/src/tensile_host.cpp +++ b/projects/rocblas/library/src/tensile_host.cpp @@ -239,6 +239,10 @@ namespace { return Tensile::LazyLoadingInit::gfx900; } + else if(deviceString.find("gfx90c") != std::string::npos) + { + return Tensile::LazyLoadingInit::gfx90c; + } else if(deviceString.find("gfx906") != std::string::npos) { return Tensile::LazyLoadingInit::gfx906; diff --git a/shared/tensile/Tensile/AsmCaps.py b/shared/tensile/Tensile/AsmCaps.py index 48eeec1f9a6..bee9b87793f 100644 --- a/shared/tensile/Tensile/AsmCaps.py +++ b/shared/tensile/Tensile/AsmCaps.py @@ -301,6 +301,50 @@ def getCapabilitiesCache(rocmVersion: NamedTuple) -> dict: 'v_mov_b64': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False}, + (9, 0, 12): {'HasAddLshl': True, + 'HasAtomicAdd': False, + 'HasDirectToLdsDest': False, + 'HasDirectToLdsNoDest': True, + 'HasExplicitCO': True, + 'HasExplicitNC': False, + 'HasGLCModifier': True, + 'HasNTModifier': False, + 'HasLshlOr': True, + 'HasMFMA': False, + 'HasMFMA_b8': False, + 'HasMFMA_bf16_1k': False, + 'HasMFMA_bf16_original': False, + 'HasMFMA_constSrc': False, + 'HasMFMA_f64': False, + 'HasMFMA_f8': False, + 'HasMFMA_i8_908': False, + 'HasMFMA_i8_940': False, + 'HasMFMA_vgpr': False, + 'HasMFMA_xf32': False, + 'HasSMulHi': True, + 'HasWMMA': False, + 'KernargPreloading': False, + 'MaxLgkmcnt': 15, + 'MaxVmcnt': 63, + 'SupportedISA': True, + 'SupportedSource': True, + 'VOP3v_dot4_i32_i8': False, + 'v_dot2_f32_f16': False, + 'v_dot2c_f32_f16': False, + 'v_dot4_i32_i8': False, + 'v_dot4c_i32_i8': False, + 'v_fma_f16': True, + 'v_fma_f32': True, + 'v_fma_f64': True, + 'v_fma_mix_f32': False, + 'v_fmac_f16': False, + 'v_fmac_f32': False, + 'v_mac_f16': True, + 'v_mac_f32': True, + 'v_mad_mix_f32': True, + 'v_mov_b64': False, + 'v_pk_fma_f16': True, + 'v_pk_fmac_f16': False}, (9, 4, 2): {'HasAddLshl': True, 'HasAtomicAdd': True, 'HasDirectToLdsDest': False, diff --git a/shared/tensile/Tensile/Common.py b/shared/tensile/Tensile/Common.py index 220430061b2..1f2a2a0133d 100644 --- a/shared/tensile/Tensile/Common.py +++ b/shared/tensile/Tensile/Common.py @@ -246,7 +246,7 @@ class DeveloperWarning(Warning): globalParameters["MaxFileName"] = 64 # If a file name would be longer than this, shorten it with a hash. globalParameters["SupportedISA"] = [(8,0,3), - (9,0,0), (9,0,6), (9,0,8), (9,0,10), + (9,0,0), (9,0,6), (9,0,8), (9,0,10), (9,0,12), (9,4,2), (9,5,0), (10,1,0), (10,1,1), (10,1,2), (10,3,0), (10,3,1), (10,3,2), (10,3,3), (10,3,4), (10,3,5), (10,3,6), (11,0,0), (11,0,1), (11,0,2), (11,0,3), @@ -317,7 +317,7 @@ class DeveloperWarning(Warning): # Translate GPU targets to filter filenames in Tensile_LOGIC directory architectureMap = { 'all':'_', 'gfx000':'none', 'fallback':'hip', - 'gfx803':'r9nano', 'gfx900':'vega10', 'gfx900:xnack-':'vega10', + 'gfx803':'r9nano', 'gfx900':'vega10', 'gfx900:xnack-':'vega10', 'gfx90c':'vega10', 'gfx906':'vega20', 'gfx906:xnack+':'vega20', 'gfx906:xnack-':'vega20', 'gfx908':'arcturus','gfx908:xnack+':'arcturus', 'gfx908:xnack-':'arcturus', 'gfx90a':'aldebaran', 'gfx90a:xnack+':'aldebaran', 'gfx90a:xnack-':'aldebaran', diff --git a/shared/tensile/Tensile/Source/cmake/TensileSupportedArchitectures.cmake b/shared/tensile/Tensile/Source/cmake/TensileSupportedArchitectures.cmake index a1fb7166cf6..c24f8241652 100644 --- a/shared/tensile/Tensile/Source/cmake/TensileSupportedArchitectures.cmake +++ b/shared/tensile/Tensile/Source/cmake/TensileSupportedArchitectures.cmake @@ -32,6 +32,7 @@ if(NOT BUILD_ADDRESS_SANITIZER) list(APPEND BASE_ARCHITECTURES "gfx803" "gfx900" + "gfx90c" "gfx906" "gfx908" "gfx90a" diff --git a/shared/tensile/Tensile/Source/lib/include/Tensile/AMDGPU.hpp b/shared/tensile/Tensile/Source/lib/include/Tensile/AMDGPU.hpp index 1d22bfe712d..1c35bfaf157 100644 --- a/shared/tensile/Tensile/Source/lib/include/Tensile/AMDGPU.hpp +++ b/shared/tensile/Tensile/Source/lib/include/Tensile/AMDGPU.hpp @@ -57,6 +57,7 @@ namespace Tensile //gfx802 = 3, gfx803 = 803, gfx900 = 900, + gfx90c = 912, gfx906 = 906, gfx908 = 908, gfx90a = 910, @@ -92,6 +93,8 @@ namespace Tensile return "gfx803"; case AMDGPU::Processor::gfx900: return "gfx900"; + case AMDGPU::Processor::gfx90c: + return "gfx90c"; case AMDGPU::Processor::gfx906: return "gfx906"; case AMDGPU::Processor::gfx908: @@ -156,6 +159,10 @@ namespace Tensile { return AMDGPU::Processor::gfx900; } + else if(deviceString.find("gfx90c") != std::string::npos) + { + return AMDGPU::Processor::gfx90c; + } else if(deviceString.find("gfx906") != std::string::npos) { return AMDGPU::Processor::gfx906; diff --git a/shared/tensile/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp b/shared/tensile/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp index c164bde1c13..2967da38e25 100644 --- a/shared/tensile/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp +++ b/shared/tensile/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp @@ -41,6 +41,7 @@ namespace Tensile None, gfx803, gfx900, + gfx90c, gfx906, gfx908, gfx90a, @@ -80,6 +81,8 @@ namespace Tensile return "TensileLibrary_*_gfx803"; case LazyLoadingInit::gfx900: return "TensileLibrary_*_gfx900"; + case LazyLoadingInit::gfx90c: + return "TensileLibrary_*_gfx90c"; case LazyLoadingInit::gfx906: return "TensileLibrary_*_gfx906"; case LazyLoadingInit::gfx908: diff --git a/shared/tensile/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp b/shared/tensile/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp index 82e498b4fa7..3953f5002b8 100644 --- a/shared/tensile/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp +++ b/shared/tensile/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp @@ -217,6 +217,7 @@ namespace Tensile { iot::enumCase(io, value, "gfx803", AMDGPU::Processor::gfx803); iot::enumCase(io, value, "gfx900", AMDGPU::Processor::gfx900); + iot::enumCase(io, value, "gfx90c", AMDGPU::Processor::gfx90c); iot::enumCase(io, value, "gfx906", AMDGPU::Processor::gfx906); iot::enumCase(io, value, "gfx908", AMDGPU::Processor::gfx908); iot::enumCase(io, value, "gfx90a", AMDGPU::Processor::gfx90a); diff --git a/shared/tensile/next-cmake/cmake/TensileSupportedArchitectures.cmake b/shared/tensile/next-cmake/cmake/TensileSupportedArchitectures.cmake index a1fb7166cf6..c24f8241652 100644 --- a/shared/tensile/next-cmake/cmake/TensileSupportedArchitectures.cmake +++ b/shared/tensile/next-cmake/cmake/TensileSupportedArchitectures.cmake @@ -32,6 +32,7 @@ if(NOT BUILD_ADDRESS_SANITIZER) list(APPEND BASE_ARCHITECTURES "gfx803" "gfx900" + "gfx90c" "gfx906" "gfx908" "gfx90a" From a22c2829e22da769cf8abb41d9c9456675ef9a46 Mon Sep 17 00:00:00 2001 From: Harkirat Gill Date: Tue, 10 Mar 2026 11:07:02 -0400 Subject: [PATCH 2/2] Update tensile-create-library-cli.rst --- .../docs/src/cli-reference/tensile-create-library-cli.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared/tensile/docs/src/cli-reference/tensile-create-library-cli.rst b/shared/tensile/docs/src/cli-reference/tensile-create-library-cli.rst index 0bd08251f8a..2f3b59d75d7 100644 --- a/shared/tensile/docs/src/cli-reference/tensile-create-library-cli.rst +++ b/shared/tensile/docs/src/cli-reference/tensile-create-library-cli.rst @@ -53,7 +53,7 @@ Here is the list of optional arguments for invoking the ``TensileCreateLibrary`` * - \-\-architecture=ARCHITECTURE - Architectures to generate a library for. When specifying multiple options, use quoted and semicolon-delimited architectures such as \-\-architecture='gfx908;gfx1012'. - Supported architectures include: all; gfx000; gfx803; gfx900; gfx900:xnack-; gfx906; gfx906:xnack+; gfx906:xnack-; gfx908; gfx908:xnack+; + Supported architectures include: all; gfx000; gfx803; gfx900; gfx900:xnack-; gfx90c; gfx906; gfx906:xnack+; gfx906:xnack-; gfx908; gfx908:xnack+; gfx908:xnack-; gfx90a; gfx90a:xnack+; gfx90a:xnack-; gfx940; gfx940:xnack+; gfx940:xnack-; gfx941; gfx941:xnack+; gfx941:xnack-; gfx942; gfx942:xnack+; gfx942:xnack-; gfx1010; gfx1011; gfx1012; gfx1030; gfx1031; gfx1032; gfx1034; gfx1035; gfx1100; gfx1101; gfx1102; gfx1103; gfx1150; gfx1151; gfx1152; gfx1153; gfx1200; gfx1201.