#define CC_VOLTA 700
#define CC_TURING 750
#define CC_AMPERE 800
+#define CC_ADA_LOVELACE 890
#define CC_OFFSET_AMD 0x1000000
+#define CC_OFFSET_MTHREADS 0x0100000
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
+#define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < CC_OFFSET_MTHREADS)
+#define GGML_CUDA_CC_IS_AMD(cc)    (cc >= CC_OFFSET_AMD)
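+// Vendor compute capabilities are packed into disjoint ranges of a single int:
+// NVIDIA ccs are used as-is (e.g. 750 for Turing), Moore Threads and AMD ccs get
+// their vendor offset added, so CC_OFFSET_MTHREADS < CC_OFFSET_AMD must hold.
+// Example: GGML_CUDA_CC_IS_AMD(CC_RDNA2) is true, GGML_CUDA_CC_IS_NVIDIA(CC_RDNA2) is false.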

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

@@ -134,6 +138,49 @@ typedef float2 dfloat2;
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING

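+// cp.async: asynchronous copies from global to shared memory; the underlying PTX
+// instruction requires compute capability 8.0 (Ampere) or newer.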
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#define CP_ASYNC_AVAILABLE
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+
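+// __CUDA_ARCH_LIST__ is defined by nvcc (11.5 and newer) as a comma-separated list of
+// all architectures being compiled for, e.g. 700,750,800 when targeting Volta, Turing,
+// and Ampere.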
+#ifdef __CUDA_ARCH_LIST__
+constexpr bool ggml_cuda_has_arch_impl(int) {
+    return false;
+}
+
+template <class ... Archs>
+constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
+    return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
+}
+
+constexpr bool ggml_cuda_has_arch(const int arch) {
+    return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
+}
+
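+// Returns the highest entry of __CUDA_ARCH_LIST__ that is <= arch, i.e. the best
+// compiled code version that a device of compute capability arch can actually run.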
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur) {
+    if (cur == 0) {
+        GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
+    }
+    return cur;
+}
+
+template <class ... Archs>
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
+    if (first <= arch && first > cur) {
+        return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...);
+    } else {
+        return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...);
+    }
+}
+
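+// Example: when building for 610 and 700, ggml_cuda_highest_compiled_arch(750) returns 700;
+// ggml_cuda_highest_compiled_arch(600) aborts because no compiled arch is <= 600.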
+constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
+    return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__);
+}
+#else
+static int ggml_cuda_highest_compiled_arch(const int arch) {
+    return arch;
+}
+#endif // __CUDA_ARCH_LIST__
+
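+// cc 6.1 (consumer Pascal) is excluded: its FP16 arithmetic throughput is only 1/64 of FP32.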
static constexpr bool fast_fp16_available(const int cc) {
    return cc >= CC_PASCAL && cc != 610;
}
@@ -146,6 +193,15 @@ static constexpr bool int8_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}

+// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
+static bool new_mma_available(const int cc) {
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= CC_TURING;
+}
+
+static bool cp_async_available(const int cc) {
+    return cc < CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= CC_AMPERE;
+}
+
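+// Note: these helpers are evaluated on the host with the device's actual compute
+// capability; the *_AVAILABLE macros above are their compile-time (__CUDA_ARCH__)
+// device-side counterparts.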
[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {