From 218573d38a6333b7caae6f2a09f837861a5423ff Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Mon, 3 Nov 2025 11:44:31 -0700 Subject: [PATCH 01/19] - Add ability to have per precision kernel configuration entries. --- .../src/device/generator/stockham_gen.cpp | 100 +++++------ .../src/device/generator/stockham_gen.h | 6 +- .../library/src/device/kernel-generator.py | 164 ++++++++++-------- .../kernels/configs/config_2d_single.py | 12 +- .../src/device/kernels/configs/config_sbrr.py | 26 +-- .../rocfft/library/src/rocfft_aot_helper.cpp | 4 +- .../src/rocfft_kernel_config_search.cpp | 2 +- .../library/src/rtc_stockham_kernel.cpp | 11 +- 8 files changed, 170 insertions(+), 155 deletions(-) diff --git a/projects/rocfft/library/src/device/generator/stockham_gen.cpp b/projects/rocfft/library/src/device/generator/stockham_gen.cpp index e7f921cd42f..16813c856bd 100644 --- a/projects/rocfft/library/src/device/generator/stockham_gen.cpp +++ b/projects/rocfft/library/src/device/generator/stockham_gen.cpp @@ -52,7 +52,7 @@ struct GeneratedLauncher const std::vector& pp_factors_other, const unsigned int& pp_current_dim, const unsigned int& pp_off_dim, - bool double_precision, + const unsigned int& precision_type, const std::string& sbrc_type, const std::string& sbrc_transpose_type) : scheme(scheme) @@ -69,7 +69,7 @@ struct GeneratedLauncher , direct_to_from_reg(kernel.direct_to_from_reg) , sbrc_type(sbrc_type) , sbrc_transpose_type(sbrc_transpose_type) - , double_precision(double_precision) + , precision_type(precision_type) { } @@ -90,7 +90,8 @@ struct GeneratedLauncher // SBRC transpose type std::string sbrc_type; std::string sbrc_transpose_type; - bool double_precision; + + unsigned int precision_type; // output a json object that the python generator can parse to know // how to build the function pool @@ -129,7 +130,7 @@ struct GeneratedLauncher add_member("direct_to_from_reg", direct_to_from_reg ? "true" : "false"); add_member("sbrc_type", quote_str(sbrc_type)); add_member("sbrc_transpose_type", quote_str(sbrc_transpose_type)); - add_member("double_precision", double_precision ? "true" : "false"); + add_member("precision_type", std::to_string(precision_type)); add_member("pp_child_scheme", quote_str(pp_child_scheme)); add_member("pp_factors_curr", vec_to_list(pp_factors_curr)); add_member("pp_factors_other", vec_to_list(pp_factors_other)); @@ -149,7 +150,7 @@ struct LaunchSuffix std::string sbrc_transpose_type; }; -void make_launcher(const std::vector& precision_types, +void make_launcher(const unsigned int& precision_type, const std::vector& launcher_suffixes, StockhamKernel& kernel, const std::string& pp_child_scheme, @@ -159,21 +160,19 @@ void make_launcher(const std::vector& precision_types, const unsigned int& pp_off_dim, std::vector& generated_launchers) { - for(auto precision_type : precision_types) + + for(auto&& launcher : launcher_suffixes) { - for(auto&& launcher : launcher_suffixes) - { - generated_launchers.emplace_back(kernel, - launcher.scheme, - pp_child_scheme, - pp_factors_curr, - pp_factors_other, - pp_current_dim, - pp_off_dim, - precision_type == rocfft_precision_double, - launcher.sbrc_type, - launcher.sbrc_transpose_type); - } + generated_launchers.emplace_back(kernel, + launcher.scheme, + pp_child_scheme, + pp_factors_curr, + pp_factors_other, + pp_current_dim, + pp_off_dim, + precision_type, + launcher.sbrc_type, + launcher.sbrc_transpose_type); } } @@ -255,7 +254,7 @@ void stockham_partial_pass_variants(const std::string& kernel_name if(params_1.current_dim == 0 && params_2.current_dim == 2) { StockhamPartialPassKernelRR kernelRR(specs1, params_1); - make_launcher(specs1.precisions, + make_launcher(specs1.precision, {{"pp_stoc", specs1.scheme, "", ""}}, kernelRR, "CS_KERNEL_STOCKHAM_PP", @@ -266,7 +265,7 @@ void stockham_partial_pass_variants(const std::string& kernel_name launchers); StockhamPartialPassKernelCC kernelCC(specs2, params_2, false); - make_launcher(specs2.precisions, + make_launcher(specs2.precision, {{"pp_sbcc", specs2.scheme, "", ""}}, kernelCC, "CS_KERNEL_STOCKHAM_PP_BLOCK_CC", @@ -279,7 +278,7 @@ void stockham_partial_pass_variants(const std::string& kernel_name else if(params_1.current_dim == 2 && params_2.current_dim == 0) { StockhamPartialPassKernelRR kernelCC(specs1, params_1); - make_launcher(specs1.precisions, + make_launcher(specs1.precision, {{"pp_sbcc", specs1.scheme, "", ""}}, kernelCC, "CS_KERNEL_STOCKHAM_PP_BLOCK_CC", @@ -290,7 +289,7 @@ void stockham_partial_pass_variants(const std::string& kernel_name launchers); StockhamPartialPassKernelCC kernelRR(specs2, params_2, false); - make_launcher(specs2.precisions, + make_launcher(specs2.precision, {{"pp_stoc", specs2.scheme, "", ""}}, kernelRR, "CS_KERNEL_STOCKHAM_PP", @@ -341,7 +340,7 @@ void stockham_variants(const std::string& kernel_name, if(specs.scheme == "CS_KERNEL_STOCKHAM") { StockhamKernelRR kernel(specs); - make_launcher(specs.precisions, + make_launcher(specs.precision, {{"stoc", specs.scheme, "", ""}}, kernel, "CS_NONE", @@ -354,7 +353,7 @@ void stockham_variants(const std::string& kernel_name, else if(specs.scheme == "CS_KERNEL_STOCKHAM_BLOCK_CC") { StockhamKernelCC kernel(specs, false, false); - make_launcher(specs.precisions, + make_launcher(specs.precision, {{"sbcc", specs.scheme, "", ""}}, kernel, "CS_NONE", @@ -405,7 +404,7 @@ void stockham_variants(const std::string& kernel_name, "SBRC_3D_FFT_ERC_TRANS_Z_XY", "TILE_UNALIGNED"}); - make_launcher(specs.precisions, + make_launcher(specs.precision, suffixes, kernel, "CS_NONE", @@ -419,7 +418,7 @@ void stockham_variants(const std::string& kernel_name, { StockhamKernelCR kernel(specs); - make_launcher(specs.precisions, + make_launcher(specs.precision, {{"sbcr", specs.scheme, "", ""}}, kernel, "CS_NONE", @@ -433,20 +432,16 @@ void stockham_variants(const std::string& kernel_name, { StockhamKernelFused2D fused2d(specs, specs2d); - // output 2D launchers - for(auto prec_type : specs.precisions) - { - launchers.emplace_back(fused2d, - specs.scheme, - "CS_NONE", - std::vector(), - std::vector(), - 0, - 0, - (prec_type == rocfft_precision_double), - "", - ""); - } + launchers.emplace_back(fused2d, + specs.scheme, + "CS_NONE", + std::vector(), + std::vector(), + 0, + 0, + specs.precision, + "", + ""); } else throw std::runtime_error("unhandled scheme"); @@ -454,15 +449,14 @@ void stockham_variants(const std::string& kernel_name, output_json(launchers, kernel_name, output); } -static size_t max_bytes_per_element(const std::vector& precisions) +static size_t max_bytes_per_element(const unsigned int& precision) { // generate for the maximum element size in the available // precisions size_t element_size = 0; - for(auto p : precisions) - { - element_size = std::max(element_size, complex_type_size(static_cast(p))); - } + element_size + = std::max(element_size, complex_type_size(static_cast(precision))); + return element_size; } @@ -698,8 +692,8 @@ int main() threads_per_transform = parse_uints_csv(*arg); ++arg; - std::vector precisions; - precisions = parse_uints_csv(*arg); + unsigned int precision; + precision = std::stoul(*arg); // create spec and pass to stockham_variants, writes partial output to stdout std::cout << DELIM; @@ -733,12 +727,12 @@ int main() if(direct_to_from_reg.size() != 2) throw std::runtime_error("CS_3D_PP requires two direct_to_from_reg configuration"); - StockhamGeneratorSpecs specs1(factors1, {}, precisions, workgroup_size[0], scheme); + StockhamGeneratorSpecs specs1(factors1, {}, precision, workgroup_size[0], scheme); specs1.direct_to_from_reg = direct_to_from_reg[0]; specs1.threads_per_transform = threads_per_transform[0]; specs1.wgs_is_derived = true; - StockhamGeneratorSpecs specs2(factors2, {}, precisions, workgroup_size[1], scheme); + StockhamGeneratorSpecs specs2(factors2, {}, precision, workgroup_size[1], scheme); specs2.direct_to_from_reg = direct_to_from_reg[1]; specs2.threads_per_transform = threads_per_transform[1]; specs2.wgs_is_derived = true; @@ -769,17 +763,17 @@ int main() ++arg; factors = parse_uints_csv(*arg); - StockhamGeneratorSpecs specs(factors, factors2d, precisions, workgroup_size[0], scheme); + StockhamGeneratorSpecs specs(factors, factors2d, precision, workgroup_size[0], scheme); specs.half_lds = half_lds; specs.direct_to_from_reg = direct_to_from_reg[0]; - specs.bytes_per_element = max_bytes_per_element(precisions); + specs.bytes_per_element = max_bytes_per_element(precision); specs.threads_per_transform = threads_per_transform.front(); // second dimension for 2D_SINGLE StockhamGeneratorSpecs specs2d( - factors2d, factors, precisions, workgroup_size[0], scheme); + factors2d, factors, precision, workgroup_size[0], scheme); if(!threads_per_transform.empty()) specs2d.threads_per_transform = threads_per_transform.back(); diff --git a/projects/rocfft/library/src/device/generator/stockham_gen.h b/projects/rocfft/library/src/device/generator/stockham_gen.h index b71cb57e3e9..b3907ae114f 100644 --- a/projects/rocfft/library/src/device/generator/stockham_gen.h +++ b/projects/rocfft/library/src/device/generator/stockham_gen.h @@ -32,12 +32,12 @@ struct StockhamGeneratorSpecs { StockhamGeneratorSpecs(const std::vector& factors, const std::vector& factors2d, - const std::vector& precisions, + unsigned int precision, unsigned int workgroup_size, const std::string& scheme) : factors(factors) , factors2d(factors2d) - , precisions(precisions) + , precision(precision) , length(product(factors.begin(), factors.end())) , length2d(product(factors2d.begin(), factors2d.end())) , workgroup_size(workgroup_size) @@ -47,7 +47,7 @@ struct StockhamGeneratorSpecs std::vector factors; std::vector factors2d; - std::vector precisions; // mapped from rocfft_precision + unsigned int precision; // mapped from rocfft_precision unsigned int length; unsigned int length2d = 0; diff --git a/projects/rocfft/library/src/device/kernel-generator.py b/projects/rocfft/library/src/device/kernel-generator.py index b2b943b0106..6c336cc1b4d 100644 --- a/projects/rocfft/library/src/device/kernel-generator.py +++ b/projects/rocfft/library/src/device/kernel-generator.py @@ -32,6 +32,7 @@ import sys import json import threading +import copy import kernels.configs.config_sbrc as config_sbrc import kernels.configs.config_sbrr as config_sbrr @@ -40,7 +41,6 @@ import kernels.configs.config_2d_single as config_2d_single import kernels.configs.config_pp_3d as config_pp_3d -from copy import deepcopy from pathlib import Path from types import SimpleNamespace as NS from operator import mul @@ -85,17 +85,39 @@ def cjoin(xs): # # Helpers # -def unique(kernels): - """Merge kernel lists without duplicated meta.length; ignore later ones.""" +def merge_kernel_list(kernels, all_precisions): + """Merge precision list with kernel list. Check for duplicated kernel and invalid precision entries.""" r, s = list(), set() for kernel in kernels: - if isinstance(kernel.length, list): - key = tuple(kernel.length) + (kernel.scheme, kernel.lds_size_bytes) + if hasattr(kernel, 'precision'): + precisions = kernel.precision else: - key = (kernel.length, kernel.scheme, kernel.lds_size_bytes) - if key not in s: - s.add(key) - r.append(kernel) + precisions = all_precisions + + for p in precisions: + if p not in all_precisions: + print("Error: invalid precision in kernel configuration: \n" + + str(kernel)) + sys.exit(1) + + kernel_cpy = copy.copy(kernel) + kernel_cpy.precision = p + + if isinstance(kernel_cpy.length, list): + length_key = tuple(kernel_cpy.length) + else: + length_key = kernel_cpy.length + + key = (length_key, kernel_cpy.scheme, kernel_cpy.precision, + kernel_cpy.lds_size_bytes) + + if key not in s: + s.add(key) + r.append(kernel_cpy) + else: + print("Error: duplicated entry in kernel configuration: " + + str(kernel)) + sys.exit(1) return r @@ -116,8 +138,8 @@ def __str__(self): f = 'FFTKernel(' use_3steps_large_twd = getattr(self.function.meta, 'use_3steps_large_twd', None) - # assume half-precision needs the same thing as single - precision = 'sp' if self.function.meta.precision == 'half' else self.function.meta.precision + # assume half-precision needs the same large twiddle table as single + precision = 'sp' if self.function.meta.precision == 'hp' else self.function.meta.precision if use_3steps_large_twd is not None: f += str(use_3steps_large_twd[precision]) else: @@ -191,7 +213,7 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): precisions = { 'sp': 'rocfft_precision_single', 'dp': 'rocfft_precision_double', - 'half': 'rocfft_precision_half', + 'hp': 'rocfft_precision_half', } var_kernel = Variable('kernel', 'FFTKernel') var_pp_kernel_1 = Variable('pp_kernel_1', 'FFTKernel') @@ -248,8 +270,10 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): and f_pp_1.meta.pp_current_dim != f_pp_2.meta.pp_current_dim): break - if (f_pp_1.meta.length != f_pp_2.meta.length): - # we hit a new kernel with different length + if (f_pp_1.meta.length != f_pp_2.meta.length or + (f_pp_1.meta.length == f_pp_2.meta.length + and f_pp_1.meta.precision != f_pp_2.meta.precision)): + # we hit a new kernel with different length/precision # start next iteration looking for the next pair counter_f_pp_1 = counter_f_pp_2 skip_to_next_iter = True @@ -317,6 +341,8 @@ def kernel_name(ns): elif ns.scheme == 'CS_KERNEL_STOCKHAM_BLOCK_CR': postfix = '_sbcr' + postfix += f'_{ns.precision}' + if hasattr(ns, 'lds_size_bytes'): postfix += f'_lds{ns.lds_size_bytes}' @@ -328,10 +354,8 @@ def list_small_kernels(): kernels1d = config_sbrr.sbrr_kernels kernels = [ - NS(**kernel.__dict__, - scheme='CS_KERNEL_STOCKHAM', - precision=['sp', 'dp'] if not hasattr(kernel, 'double_precision') - or kernel.double_precision else ['sp']) for kernel in kernels1d + NS(**kernel.__dict__, scheme='CS_KERNEL_STOCKHAM') + for kernel in kernels1d ] return kernels @@ -381,10 +405,7 @@ def list_2d_kernels(): expanded.extend( NS(**kernel.__dict__, scheme='CS_KERNEL_2D_SINGLE', - runtime_compile=True, - precision=['sp', 'dp'] if not hasattr(kernel, 'double_precision') - or kernel.double_precision else ['sp']) - for kernel in fused_2d_kernels) + runtime_compile=True) for kernel in fused_2d_kernels) return expanded @@ -412,7 +433,7 @@ def default_runtime_compile(kernels, default_val): ] -def generate_kernel_functions(kernels, precisions, launchers_json): +def generate_kernel_functions(precisions_type_dict, kernels, launchers_json): """Generate CPU functions used to populate function pool with each kernel in `kernels`, and its variations. """ @@ -449,7 +470,7 @@ def generate_kernel_functions(kernels, precisions, launchers_json): pp_current_dim = launcher.pp_current_dim pp_off_dim = launcher.pp_off_dim sbrc_transpose_type = launcher.sbrc_transpose_type - precision = 'dp' if launcher.double_precision else 'sp' + precision = precisions_type_dict[launcher.precision_type] runtime_compile = kernel.runtime_compile use_3steps_large_twd = getattr(kernel, 'use_3steps_large_twd', None) @@ -463,33 +484,29 @@ def generate_kernel_functions(kernels, precisions, launchers_json): threads_per_transform, 0 ] - precisions = [precision] - if precision == 'sp': - precisions.append('half') - for p in precisions: - f = Function(arguments=ArgumentList(data, back), - meta=NS(factors=factors, - length=length, - params=params, - precision=p, - runtime_compile=runtime_compile, - scheme=scheme, - workgroup_size=workgroup_size, - transforms_per_block=transforms_per_block, - threads_per_transform=tpt_list, - transpose=sbrc_transpose_type, - use_3steps_large_twd=use_3steps_large_twd, - lds_size_bytes=kernel.lds_size_bytes, - pp_child_scheme=pp_child_scheme, - pp_factors_curr=pp_factors_curr, - pp_factors_other=pp_factors_other, - pp_current_dim=pp_current_dim, - pp_off_dim=pp_off_dim)) - - if (scheme == 'CS_3D_PP'): - pp_kernel_functions.append(f) - else: - kernel_functions.append(f) + f = Function(arguments=ArgumentList(data, back), + meta=NS(factors=factors, + length=length, + params=params, + precision=precision, + runtime_compile=runtime_compile, + scheme=scheme, + workgroup_size=workgroup_size, + transforms_per_block=transforms_per_block, + threads_per_transform=tpt_list, + transpose=sbrc_transpose_type, + use_3steps_large_twd=use_3steps_large_twd, + lds_size_bytes=kernel.lds_size_bytes, + pp_child_scheme=pp_child_scheme, + pp_factors_curr=pp_factors_curr, + pp_factors_other=pp_factors_other, + pp_current_dim=pp_current_dim, + pp_off_dim=pp_off_dim)) + + if (scheme == 'CS_3D_PP'): + pp_kernel_functions.append(f) + else: + kernel_functions.append(f) return kernel_functions, pp_kernel_functions @@ -502,7 +519,7 @@ def read_subprocess(proc_output, output): output[0] = json_string -def generate_kernels(kernels, precisions, stockham_gen): +def generate_kernels(precisions_dict, kernels, stockham_gen): """Generate and write kernels from the kernel list. Entries in the kernel list are simple namespaces. These are @@ -511,8 +528,6 @@ def generate_kernels(kernels, precisions, stockham_gen): A list of CPU functions is returned. """ - pre_enum = {'sp': 0, 'dp': 1} - # run stockham_gen to retrieve JSON output via stdout, used for additional kernel details proc = subprocess.Popen(args=[stockham_gen], stdin=subprocess.PIPE, @@ -534,24 +549,21 @@ def generate_kernels(kernels, precisions, stockham_gen): if kernel_idx < num_kernels: k = kernels[kernel_idx] - kernel_precisions = k.precision if hasattr( - k, 'precision') else precisions - # 2D single kernels always specify threads per transform if isinstance(k.length, list): proc.stdin.write(','.join([str(f) for f in k.factors[0]]) + " ") - proc.stdin.write(','.join([str(f) - for f in k.factors[1]]) + " ") - proc.stdin.write( - ','.join([str(pre_enum[pre]) - for pre in kernel_precisions]) + " ") + proc.stdin.write(','.join([str(f) for f in k.factors[1]])) + + proc.stdin.write(f' {str(precisions_dict[k.precision])}' + " ") + proc.stdin.write(','.join( [str(f) for f in k.threads_per_transform])) else: - proc.stdin.write(','.join([str(f) for f in k.factors]) + " ") - proc.stdin.write(','.join( - [str(pre_enum[pre]) for pre in kernel_precisions])) + proc.stdin.write(','.join([str(f) for f in k.factors])) + + proc.stdin.write(f' {precisions_dict[k.precision]}') + # 1D kernels might not, and need to default to 'uwide' threads_per_transform = getattr( k, 'threads_per_transform', { @@ -620,7 +632,11 @@ def generate_kernels(kernels, precisions, stockham_gen): json_string = json_result[0] kernel_launchers = json.loads(json_string) - return generate_kernel_functions(kernels, precisions, kernel_launchers) + # Invert precisions dict for easy lookup + precisions_type_dict = {v: k for k, v in precisions_dict.items()} + + return generate_kernel_functions(precisions_type_dict, kernels, + kernel_launchers) def cli(): @@ -635,9 +651,6 @@ def cli(): type=int, help='Number of files to generate for parallel compilation.') - list_parser = subparsers.add_parser( - 'list', help='List names of kernels that will be generated.') - generate_parser = subparsers.add_parser('generate', help='Generate kernels.') generate_parser.add_argument('stockham_gen', @@ -649,7 +662,11 @@ def cli(): assert (args.num_files > 0), 'Number of files for function_pool should be positive' - precisions = ['dp', 'sp'] + # List of supported precisions to build + # sp: single-precision + # dp: double-precision + # hp: half-precision + precisions_dict = {'sp': 0, 'dp': 1, 'hp': 2} # # kernel list @@ -668,7 +685,10 @@ def cli(): if not hasattr(k, 'lds_size_bytes'): k.lds_size_bytes = 65536 - kernels = unique(kernels) + # + # merge kernel list with additional precision entries if required + # + kernels = merge_kernel_list(kernels, list(precisions_dict)) # # set runtime compile @@ -682,7 +702,7 @@ def cli(): # if args.command == 'generate': - functions, pp_functions = generate_kernels(kernels, precisions, + functions, pp_functions = generate_kernels(precisions_dict, kernels, args.stockham_gen) func_files = generate_cpu_function_pool_pieces(functions, pp_functions, args.num_files) diff --git a/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py b/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py index 181f63fb9aa..e9d1472650b 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py @@ -170,11 +170,11 @@ NS(length=[96,48], factors=[[4,6,4],[8,6]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=config_lds.LDS_160k), NS(length=[50,100], factors=[[10,5],[10,10]], threads_per_transform=[10,10], workgroup_size=250, lds_size_bytes=config_lds.LDS_160k), NS(length=[100,50], factors=[[10,10],[10,5]], threads_per_transform=[10,10], workgroup_size=250, lds_size_bytes=config_lds.LDS_160k), - NS(length=[50,96], factors=[[10,5],[4,6,4]], threads_per_transform=[10,16], double_precision=False, workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), - NS(length=[54,108], factors=[[6,3,3],[6,3,6]], threads_per_transform=[6,12], double_precision=False, workgroup_size=243, lds_size_bytes=config_lds.LDS_160k), - NS(length=[108,54], factors=[[6,3,6],[6,3,3]], threads_per_transform=[12,6], double_precision=False, workgroup_size=243, lds_size_bytes=config_lds.LDS_160k), - NS(length=[64,128], factors=[[4,4,4],[4,8,4]], threads_per_transform=[8,16], double_precision=False, workgroup_size=512, lds_size_bytes=config_lds.LDS_160k), - NS(length=[128,64], factors=[[4,8,4],[4,4,4]], threads_per_transform=[16,8], double_precision=False, workgroup_size=512, lds_size_bytes=config_lds.LDS_160k), + NS(length=[50,96], factors=[[10,5],[4,6,4]], threads_per_transform=[10,16], precision=['sp', 'hp'], workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), + NS(length=[54,108], factors=[[6,3,3],[6,3,6]], threads_per_transform=[6,12], precision=['sp', 'hp'], workgroup_size=243, lds_size_bytes=config_lds.LDS_160k), + NS(length=[108,54], factors=[[6,3,6],[6,3,3]], threads_per_transform=[12,6], precision=['sp', 'hp'], workgroup_size=243, lds_size_bytes=config_lds.LDS_160k), + NS(length=[64,128], factors=[[4,4,4],[4,8,4]], threads_per_transform=[8,16], precision=['sp', 'hp'], workgroup_size=512, lds_size_bytes=config_lds.LDS_160k), + NS(length=[128,64], factors=[[4,8,4],[4,4,4]], threads_per_transform=[16,8], precision=['sp', 'hp'], workgroup_size=512, lds_size_bytes=config_lds.LDS_160k), NS(length=[96,96], factors=[[4,6,4],[4,6,4]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=config_lds.LDS_160k), - NS(length=[100,100], factors=[[10,10],[10,10]], threads_per_transform=[10,10], double_precision=False, workgroup_size=500, lds_size_bytes=config_lds.LDS_160k), + NS(length=[100,100], factors=[[10,10],[10,10]], threads_per_transform=[10,10], precision=['sp', 'hp'], workgroup_size=500, lds_size_bytes=config_lds.LDS_160k), ] diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py b/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py index 2d495a775f9..7d56eb61d6f 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py @@ -482,11 +482,11 @@ NS(length=4000, workgroup_size=256, threads_per_transform=200, factors=(10, 10, 10, 4), runtime_compile=True), NS(length=4050, workgroup_size=256, threads_per_transform=135, factors=(10, 5, 3, 3, 3, 3), runtime_compile=True), NS(length=4096, workgroup_size=256, threads_per_transform=256, factors=(16, 16, 16), runtime_compile=True), - NS(length=4704, workgroup_size=256, threads_per_transform=224, factors=(8, 4, 7, 7, 3), double_precision=False, runtime_compile=True), - NS(length=5488, workgroup_size=256, threads_per_transform=196, factors=(7, 4, 7, 4, 7), double_precision=False, runtime_compile=True), - NS(length=6144, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 8, 3, 4), double_precision=False, runtime_compile=True), - NS(length=6561, workgroup_size=256, threads_per_transform=243, factors=(3, 3, 3, 3, 3, 3, 3, 3), double_precision=False, runtime_compile=True), - NS(length=8192, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 4, 4, 8), double_precision=False, runtime_compile=True), + NS(length=4704, workgroup_size=256, threads_per_transform=224, factors=(8, 4, 7, 7, 3), precision=['sp', 'hp'], runtime_compile=True), + NS(length=5488, workgroup_size=256, threads_per_transform=196, factors=(7, 4, 7, 4, 7), precision=['sp', 'hp'], runtime_compile=True), + NS(length=6144, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 8, 3, 4), precision=['sp', 'hp'], runtime_compile=True), + NS(length=6561, workgroup_size=256, threads_per_transform=243, factors=(3, 3, 3, 3, 3, 3, 3, 3), precision=['sp', 'hp'], runtime_compile=True), + NS(length=8192, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 4, 4, 8), precision=['sp', 'hp'], runtime_compile=True), # configs for 160KiB LDS NS(length=4704, workgroup_size=256, threads_per_transform=224, factors=(8, 4, 7, 7, 3), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), @@ -497,12 +497,12 @@ NS(length=9216, workgroup_size=512, threads_per_transform=512, factors=(4, 8, 4, 4, 3, 6), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), NS(length=10000, workgroup_size=512, threads_per_transform=500, factors=(4, 5, 5, 10, 10), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), NS(length=10240, workgroup_size=512, threads_per_transform=512, factors=(8, 4, 4, 4, 5, 4), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=10752, workgroup_size=512, threads_per_transform=512, factors=(4, 16, 8, 7, 3), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=11200, workgroup_size=512, threads_per_transform=448, factors=(4, 7, 5, 16, 5), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=12288, workgroup_size=512, threads_per_transform=512, factors=(8, 8, 4, 6, 8), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=16384, workgroup_size=512, threads_per_transform=512, factors=(8, 16, 4, 8, 4), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=16807, workgroup_size=384, threads_per_transform=343, factors=(7, 7, 7, 7, 7), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=18816, workgroup_size=512, threads_per_transform=448, factors=(8, 8, 7, 7, 6), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=19200, workgroup_size=512, threads_per_transform=480, factors=(8, 10, 8, 5, 6), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=20480, workgroup_size=512, threads_per_transform=512, factors=(4, 4, 16, 10, 8), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), + NS(length=10752, workgroup_size=512, threads_per_transform=512, factors=(4, 16, 8, 7, 3), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), + NS(length=11200, workgroup_size=512, threads_per_transform=448, factors=(4, 7, 5, 16, 5), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), + NS(length=12288, workgroup_size=512, threads_per_transform=512, factors=(8, 8, 4, 6, 8), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), + NS(length=16384, workgroup_size=512, threads_per_transform=512, factors=(8, 16, 4, 8, 4), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), + NS(length=16807, workgroup_size=384, threads_per_transform=343, factors=(7, 7, 7, 7, 7), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), + NS(length=18816, workgroup_size=512, threads_per_transform=448, factors=(8, 8, 7, 7, 6), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), + NS(length=19200, workgroup_size=512, threads_per_transform=480, factors=(8, 10, 8, 5, 6), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), + NS(length=20480, workgroup_size=512, threads_per_transform=512, factors=(4, 4, 16, 10, 8), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), ] diff --git a/projects/rocfft/library/src/rocfft_aot_helper.cpp b/projects/rocfft/library/src/rocfft_aot_helper.cpp index 5468e211f1c..21c8f5657ce 100644 --- a/projects/rocfft/library/src/rocfft_aot_helper.cpp +++ b/projects/rocfft/library/src/rocfft_aot_helper.cpp @@ -245,7 +245,7 @@ void build_stockham_function_pool(CompileQueue& queue) StockhamGeneratorSpecs specs{factors, {}, - {static_cast(precision)}, + static_cast(precision), static_cast(i.second.workgroup_size), PrintScheme(scheme)}; specs.threads_per_transform = i.second.threads_per_transform[0]; @@ -663,7 +663,7 @@ void build_solution_kernels(CompileQueue& queue) StockhamGeneratorSpecs specs{factors, {}, - {static_cast(precision)}, + static_cast(precision), static_cast(config.workgroup_size), PrintScheme(scheme)}; specs.threads_per_transform = config.threads_per_transform[0]; diff --git a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp index 7b6e4141252..619e83ff2f6 100644 --- a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp +++ b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp @@ -159,7 +159,7 @@ std::string test_kernel_src(const std::string& kernel_name, { StockhamGeneratorSpecs specs{factorization, {}, - {static_cast(rocfft_precision_single)}, + static_cast(rocfft_precision_single), wgs, PrintScheme(compute_scheme)}; diff --git a/projects/rocfft/library/src/rtc_stockham_kernel.cpp b/projects/rocfft/library/src/rtc_stockham_kernel.cpp index 543be5a551f..b60d2843e59 100644 --- a/projects/rocfft/library/src/rtc_stockham_kernel.cpp +++ b/projects/rocfft/library/src/rtc_stockham_kernel.cpp @@ -74,11 +74,11 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& std::vector factors; std::copy(kernel->factors.begin(), kernel->factors.end(), std::back_inserter(factors)); - std::vector precisions = {static_cast(node.precision)}; + auto precision = static_cast(node.precision); specs.emplace(factors, std::vector(), - precisions, + precision, static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs->threads_per_transform = kernel->threads_per_transform[0]; @@ -105,7 +105,8 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& std::vector factors1d; std::vector factors2d; - std::vector precisions = {static_cast(node.precision)}; + + auto precision = static_cast(node.precision); // need to break down factors into first dim and second dim size_t len0_remain = node.length[0]; @@ -124,7 +125,7 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs.emplace(factors1d, factors2d, - precisions, + precision, static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs->threads_per_transform = kernel->threads_per_transform[0]; @@ -133,7 +134,7 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs2d.emplace(factors2d, factors1d, - precisions, + precision, static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs2d->threads_per_transform = kernel->threads_per_transform[1]; From e64821d707d5858b0d2f85acb75c2b49814eea5e Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Wed, 5 Nov 2025 16:44:18 -0700 Subject: [PATCH 02/19] - Change bytes_per_element calculation to match what was previously implemented. --- .../src/device/generator/stockham_gen.cpp | 17 ++---- .../library/src/device/kernel-generator.py | 55 ++++++++++++++++--- 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/projects/rocfft/library/src/device/generator/stockham_gen.cpp b/projects/rocfft/library/src/device/generator/stockham_gen.cpp index 16813c856bd..283e337f5e4 100644 --- a/projects/rocfft/library/src/device/generator/stockham_gen.cpp +++ b/projects/rocfft/library/src/device/generator/stockham_gen.cpp @@ -449,17 +449,6 @@ void stockham_variants(const std::string& kernel_name, output_json(launchers, kernel_name, output); } -static size_t max_bytes_per_element(const unsigned int& precision) -{ - // generate for the maximum element size in the available - // precisions - size_t element_size = 0; - element_size - = std::max(element_size, complex_type_size(static_cast(precision))); - - return element_size; -} - // ========================================================= // Partial pass parameters row-major ordering helpers. // Kernel configuration parameters for CS_3D_PP in @@ -633,6 +622,7 @@ int main() std::string scheme; bool half_lds; unsigned int lds_size_bytes; + unsigned int bytes_per_element; const char* DELIM = ""; std::cout << "{"; @@ -655,6 +645,9 @@ int main() lds_size_bytes = std::stoul(*arg); ++arg; + bytes_per_element = std::stoul(*arg); + ++arg; + std::string kernel_name = *arg; ++arg; @@ -767,7 +760,7 @@ int main() specs.half_lds = half_lds; specs.direct_to_from_reg = direct_to_from_reg[0]; - specs.bytes_per_element = max_bytes_per_element(precision); + specs.bytes_per_element = bytes_per_element; specs.threads_per_transform = threads_per_transform.front(); diff --git a/projects/rocfft/library/src/device/kernel-generator.py b/projects/rocfft/library/src/device/kernel-generator.py index 6c336cc1b4d..0e35e59dca3 100644 --- a/projects/rocfft/library/src/device/kernel-generator.py +++ b/projects/rocfft/library/src/device/kernel-generator.py @@ -85,6 +85,18 @@ def cjoin(xs): # # Helpers # +def get_kernel_key(kernel): + """Return a key tuple for a kernel based on (length, scheme, lds_size_bytes).""" + if isinstance(kernel.length, list): + length_key = tuple(kernel.length) + else: + length_key = kernel.length + + key = (length_key, kernel.scheme, kernel.lds_size_bytes) + + return key + + def merge_kernel_list(kernels, all_precisions): """Merge precision list with kernel list. Check for duplicated kernel and invalid precision entries.""" r, s = list(), set() @@ -103,13 +115,7 @@ def merge_kernel_list(kernels, all_precisions): kernel_cpy = copy.copy(kernel) kernel_cpy.precision = p - if isinstance(kernel_cpy.length, list): - length_key = tuple(kernel_cpy.length) - else: - length_key = kernel_cpy.length - - key = (length_key, kernel_cpy.scheme, kernel_cpy.precision, - kernel_cpy.lds_size_bytes) + key = (get_kernel_key(kernel_cpy), kernel_cpy.precision) if key not in s: s.add(key) @@ -121,6 +127,34 @@ def merge_kernel_list(kernels, all_precisions): return r +def set_bytes_per_element(kernels): + """Default behavior for bytes_per_element calculation is to use the highest precision + for a given kernel key (length, scheme, lds_size_bytes). bytes_per_element is used to + calculate transforms_per_block and workgroup_size later on. This behavior for tpb and + wgs calculation can be overridden by using wgs_is_derived = true in kernel config. + NOTE: Once all kernels are tuned by precision, this function can most likely go away. + """ + d = dict() + # add precision entries for a given (length, scheme, lds_size_bytes) key + for kernel in kernels: + key = get_kernel_key(kernel) + if key not in d: + d[key] = list() + d[key].append(kernel.precision) + + for kernel in kernels: + key = get_kernel_key(kernel) + precisions = d[key] + if 'dp' in precisions: + kernel.bytes_per_element = 16 + elif 'sp' in precisions: + kernel.bytes_per_element = 8 + elif 'hp' in precisions: + kernel.bytes_per_element = 4 + + return kernels + + def is_aot_rtc(meta): return not meta.runtime_compile @@ -613,6 +647,7 @@ def generate_kernels(precisions_dict, kernels, stockham_gen): proc.stdin.write(f' {k.scheme}') proc.stdin.write(f' {kernel_name(k)}') + proc.stdin.write(f' {k.bytes_per_element}') proc.stdin.write(f' {k.lds_size_bytes}') proc.stdin.write('\n') @@ -691,9 +726,13 @@ def cli(): kernels = merge_kernel_list(kernels, list(precisions_dict)) # - # set runtime compile + # set bytes_per_element based on highest precision for a given kernel key # + kernels = set_bytes_per_element(kernels) + # + # set runtime compile + # kernels = default_runtime_compile(kernels, args.runtime_compile_default == 'ON') From 25123011333b225034c9a69d1d9fbe5c2c992a26 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Mon, 24 Nov 2025 16:58:41 -0700 Subject: [PATCH 03/19] - Add support for arch specific entries in the function pool. --- .../src/device/generator/stockham_gen.cpp | 32 +++++++- .../src/device/generator/stockham_gen.h | 3 + .../library/src/device/kernel-generator.py | 78 ++++++++++++------- .../kernels/configs/config_2d_single.py | 1 + .../src/device/kernels/configs/config_arch.py | 39 ++++++++++ .../device/kernels/configs/config_pp_3d.py | 1 + .../src/device/kernels/configs/config_sbcc.py | 1 + .../src/device/kernels/configs/config_sbcr.py | 1 + .../src/device/kernels/configs/config_sbrc.py | 1 + .../src/device/kernels/configs/config_sbrr.py | 1 + .../library/src/include/function_map_key.h | 57 ++++++++++---- .../library/src/include/function_pool.h | 10 ++- .../rocfft/library/src/rocfft_aot_helper.cpp | 2 + .../src/rocfft_kernel_config_search.cpp | 2 + .../library/src/rtc_stockham_kernel.cpp | 3 + projects/rocfft/shared/device_properties.h | 10 +++ 16 files changed, 194 insertions(+), 48 deletions(-) create mode 100644 projects/rocfft/library/src/device/kernels/configs/config_arch.py diff --git a/projects/rocfft/library/src/device/generator/stockham_gen.cpp b/projects/rocfft/library/src/device/generator/stockham_gen.cpp index 283e337f5e4..f7f3234fae4 100644 --- a/projects/rocfft/library/src/device/generator/stockham_gen.cpp +++ b/projects/rocfft/library/src/device/generator/stockham_gen.cpp @@ -53,6 +53,7 @@ struct GeneratedLauncher const unsigned int& pp_current_dim, const unsigned int& pp_off_dim, const unsigned int& precision_type, + const std::string& gcn_arch_name, const std::string& sbrc_type, const std::string& sbrc_transpose_type) : scheme(scheme) @@ -70,6 +71,8 @@ struct GeneratedLauncher , sbrc_type(sbrc_type) , sbrc_transpose_type(sbrc_transpose_type) , precision_type(precision_type) + , gcn_arch_name(gcn_arch_name) + { } @@ -93,6 +96,8 @@ struct GeneratedLauncher unsigned int precision_type; + std::string gcn_arch_name; + // output a json object that the python generator can parse to know // how to build the function pool std::string to_string() const @@ -131,6 +136,7 @@ struct GeneratedLauncher add_member("sbrc_type", quote_str(sbrc_type)); add_member("sbrc_transpose_type", quote_str(sbrc_transpose_type)); add_member("precision_type", std::to_string(precision_type)); + add_member("gcn_arch_name", quote_str(gcn_arch_name)); add_member("pp_child_scheme", quote_str(pp_child_scheme)); add_member("pp_factors_curr", vec_to_list(pp_factors_curr)); add_member("pp_factors_other", vec_to_list(pp_factors_other)); @@ -153,6 +159,7 @@ struct LaunchSuffix void make_launcher(const unsigned int& precision_type, const std::vector& launcher_suffixes, StockhamKernel& kernel, + const std::string& gcn_arch_name, const std::string& pp_child_scheme, const std::vector& pp_factors_curr, const std::vector& pp_factors_other, @@ -171,6 +178,7 @@ void make_launcher(const unsigned int& precision_type, pp_current_dim, pp_off_dim, precision_type, + gcn_arch_name, launcher.sbrc_type, launcher.sbrc_transpose_type); } @@ -257,6 +265,7 @@ void stockham_partial_pass_variants(const std::string& kernel_name make_launcher(specs1.precision, {{"pp_stoc", specs1.scheme, "", ""}}, kernelRR, + specs1.gcn_arch_name, "CS_KERNEL_STOCKHAM_PP", params_1.pp_factors_curr, params_1.pp_factors_other, @@ -268,6 +277,7 @@ void stockham_partial_pass_variants(const std::string& kernel_name make_launcher(specs2.precision, {{"pp_sbcc", specs2.scheme, "", ""}}, kernelCC, + specs2.gcn_arch_name, "CS_KERNEL_STOCKHAM_PP_BLOCK_CC", params_2.pp_factors_curr, params_2.pp_factors_other, @@ -281,6 +291,7 @@ void stockham_partial_pass_variants(const std::string& kernel_name make_launcher(specs1.precision, {{"pp_sbcc", specs1.scheme, "", ""}}, kernelCC, + specs1.gcn_arch_name, "CS_KERNEL_STOCKHAM_PP_BLOCK_CC", params_1.pp_factors_curr, params_1.pp_factors_other, @@ -292,6 +303,7 @@ void stockham_partial_pass_variants(const std::string& kernel_name make_launcher(specs2.precision, {{"pp_stoc", specs2.scheme, "", ""}}, kernelRR, + specs2.gcn_arch_name, "CS_KERNEL_STOCKHAM_PP", params_2.pp_factors_curr, params_2.pp_factors_other, @@ -343,6 +355,7 @@ void stockham_variants(const std::string& kernel_name, make_launcher(specs.precision, {{"stoc", specs.scheme, "", ""}}, kernel, + specs.gcn_arch_name, "CS_NONE", std::vector(), std::vector(), @@ -356,6 +369,7 @@ void stockham_variants(const std::string& kernel_name, make_launcher(specs.precision, {{"sbcc", specs.scheme, "", ""}}, kernel, + specs.gcn_arch_name, "CS_NONE", std::vector(), std::vector(), @@ -407,6 +421,7 @@ void stockham_variants(const std::string& kernel_name, make_launcher(specs.precision, suffixes, kernel, + specs.gcn_arch_name, "CS_NONE", std::vector(), std::vector(), @@ -421,6 +436,7 @@ void stockham_variants(const std::string& kernel_name, make_launcher(specs.precision, {{"sbcr", specs.scheme, "", ""}}, kernel, + specs.gcn_arch_name, "CS_NONE", std::vector(), std::vector(), @@ -440,6 +456,7 @@ void stockham_variants(const std::string& kernel_name, 0, 0, specs.precision, + specs.gcn_arch_name, "", ""); } @@ -619,6 +636,7 @@ int main() std::string line; std::string kernel_name; + std::string gcn_arch_name; std::string scheme; bool half_lds; unsigned int lds_size_bytes; @@ -684,6 +702,9 @@ int main() std::vector threads_per_transform; threads_per_transform = parse_uints_csv(*arg); + ++arg; + gcn_arch_name = *arg; + ++arg; unsigned int precision; precision = std::stoul(*arg); @@ -720,12 +741,14 @@ int main() if(direct_to_from_reg.size() != 2) throw std::runtime_error("CS_3D_PP requires two direct_to_from_reg configuration"); - StockhamGeneratorSpecs specs1(factors1, {}, precision, workgroup_size[0], scheme); + StockhamGeneratorSpecs specs1( + factors1, {}, precision, gcn_arch_name, workgroup_size[0], scheme); specs1.direct_to_from_reg = direct_to_from_reg[0]; specs1.threads_per_transform = threads_per_transform[0]; specs1.wgs_is_derived = true; - StockhamGeneratorSpecs specs2(factors2, {}, precision, workgroup_size[1], scheme); + StockhamGeneratorSpecs specs2( + factors2, {}, precision, gcn_arch_name, workgroup_size[1], scheme); specs2.direct_to_from_reg = direct_to_from_reg[1]; specs2.threads_per_transform = threads_per_transform[1]; specs2.wgs_is_derived = true; @@ -756,7 +779,8 @@ int main() ++arg; factors = parse_uints_csv(*arg); - StockhamGeneratorSpecs specs(factors, factors2d, precision, workgroup_size[0], scheme); + StockhamGeneratorSpecs specs( + factors, factors2d, precision, gcn_arch_name, workgroup_size[0], scheme); specs.half_lds = half_lds; specs.direct_to_from_reg = direct_to_from_reg[0]; @@ -766,7 +790,7 @@ int main() // second dimension for 2D_SINGLE StockhamGeneratorSpecs specs2d( - factors2d, factors, precision, workgroup_size[0], scheme); + factors2d, factors, precision, gcn_arch_name, workgroup_size[0], scheme); if(!threads_per_transform.empty()) specs2d.threads_per_transform = threads_per_transform.back(); diff --git a/projects/rocfft/library/src/device/generator/stockham_gen.h b/projects/rocfft/library/src/device/generator/stockham_gen.h index b3907ae114f..7fe259b5161 100644 --- a/projects/rocfft/library/src/device/generator/stockham_gen.h +++ b/projects/rocfft/library/src/device/generator/stockham_gen.h @@ -33,11 +33,13 @@ struct StockhamGeneratorSpecs StockhamGeneratorSpecs(const std::vector& factors, const std::vector& factors2d, unsigned int precision, + const std::string& gcn_arch_name, unsigned int workgroup_size, const std::string& scheme) : factors(factors) , factors2d(factors2d) , precision(precision) + , gcn_arch_name(gcn_arch_name) , length(product(factors.begin(), factors.end())) , length2d(product(factors2d.begin(), factors2d.end())) , workgroup_size(workgroup_size) @@ -48,6 +50,7 @@ struct StockhamGeneratorSpecs std::vector factors; std::vector factors2d; unsigned int precision; // mapped from rocfft_precision + std::string gcn_arch_name; unsigned int length; unsigned int length2d = 0; diff --git a/projects/rocfft/library/src/device/kernel-generator.py b/projects/rocfft/library/src/device/kernel-generator.py index 0e35e59dca3..d23253bd3a0 100644 --- a/projects/rocfft/library/src/device/kernel-generator.py +++ b/projects/rocfft/library/src/device/kernel-generator.py @@ -40,6 +40,7 @@ import kernels.configs.config_sbcr as config_sbcr import kernels.configs.config_2d_single as config_2d_single import kernels.configs.config_pp_3d as config_pp_3d +import kernels.configs.config_arch as config_arch from pathlib import Path from types import SimpleNamespace as NS @@ -98,32 +99,48 @@ def get_kernel_key(kernel): def merge_kernel_list(kernels, all_precisions): - """Merge precision list with kernel list. Check for duplicated kernel and invalid precision entries.""" + """Merge precision and architecture lists with kernel list. + Check for duplicated kernel and invalid precision/arch entries.""" r, s = list(), set() + all_archs = [member.value for member in config_arch.supported_arch] + for kernel in kernels: if hasattr(kernel, 'precision'): precisions = kernel.precision else: precisions = all_precisions - for p in precisions: - if p not in all_precisions: - print("Error: invalid precision in kernel configuration: \n" + - str(kernel)) - sys.exit(1) - - kernel_cpy = copy.copy(kernel) - kernel_cpy.precision = p - - key = (get_kernel_key(kernel_cpy), kernel_cpy.precision) - - if key not in s: - s.add(key) - r.append(kernel_cpy) - else: - print("Error: duplicated entry in kernel configuration: " + - str(kernel)) + if hasattr(kernel, 'gcn_arch_name'): + archs = [member.value for member in kernel.gcn_arch_name] + else: + archs = [config_arch.supported_arch.GFX_GENERIC.value] + for a in archs: + if a not in all_archs: + print( + "Error: invalid architecture in kernel configuration: \n" + + str(kernel)) sys.exit(1) + for p in precisions: + if p not in all_precisions: + print( + "Error: invalid precision in kernel configuration: \n" + + str(kernel)) + sys.exit(1) + + kernel_cpy = copy.copy(kernel) + kernel_cpy.precision = p + kernel_cpy.gcn_arch_name = a + + key = (get_kernel_key(kernel_cpy), kernel_cpy.precision, + kernel_cpy.gcn_arch_name) + + if key not in s: + s.add(key) + r.append(kernel_cpy) + else: + print("Error: duplicated entry in kernel configuration: " + + str(kernel)) + sys.exit(1) return r @@ -264,16 +281,17 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): curr_func, curr_file = 0, 0 while curr_func < len(functions): f = functions[curr_func] - length, precision, scheme, transpose = f.meta.length, f.meta.precision, f.meta.scheme, f.meta.transpose + length, precision, arch, scheme, transpose = f.meta.length, f.meta.precision, f.meta.gcn_arch_name, f.meta.scheme, f.meta.transpose if isinstance(length, (int, str)): length = [length, 0] piece_contents[curr_file] += Assign(var_kernel, FFTKernel(f)) - key = Call( - name='FMKey', - arguments=ArgumentList(length[0], length[1], precisions[precision], - scheme, transpose or 'NONE', - 'kernel.get_kernel_config()')).inline() + key = Call(name='FMKey', + arguments=ArgumentList(length[0], length[1], + precisions[precision], scheme, + transpose or 'NONE', + 'kernel.get_kernel_config()', + ''.join(['"', arch, '"']))).inline() piece_contents[curr_file] += function_map.insert( key, var_kernel, 'std::get<0>(def_keys)', 'std::get<0>(function_maps)', f.meta.lds_size_bytes) @@ -335,7 +353,9 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): length[0], length[1], length[2], precisions[precision], scheme, 'pp_kernel_1.get_kernel_config()', - 'pp_kernel_2.get_kernel_config()')).inline() + 'pp_kernel_2.get_kernel_config()', + ''.join(['"', f_pp_1.meta.gcn_arch_name, + '"']))).inline() piece_contents[curr_file] += function_map.insert_pp( key, var_pp_kernel_1, var_pp_kernel_2, 'std::get<1>(def_keys)', 'std::get<1>(function_maps)', f_pp_1.meta.lds_size_bytes) @@ -505,6 +525,7 @@ def generate_kernel_functions(precisions_type_dict, kernels, launchers_json): pp_off_dim = launcher.pp_off_dim sbrc_transpose_type = launcher.sbrc_transpose_type precision = precisions_type_dict[launcher.precision_type] + gcn_arch_name = launcher.gcn_arch_name runtime_compile = kernel.runtime_compile use_3steps_large_twd = getattr(kernel, 'use_3steps_large_twd', None) @@ -523,6 +544,7 @@ def generate_kernel_functions(precisions_type_dict, kernels, launchers_json): length=length, params=params, precision=precision, + gcn_arch_name=gcn_arch_name, runtime_compile=runtime_compile, scheme=scheme, workgroup_size=workgroup_size, @@ -589,7 +611,9 @@ def generate_kernels(precisions_dict, kernels, stockham_gen): for f in k.factors[0]]) + " ") proc.stdin.write(','.join([str(f) for f in k.factors[1]])) - proc.stdin.write(f' {str(precisions_dict[k.precision])}' + " ") + proc.stdin.write(f' {str(precisions_dict[k.precision])}') + + proc.stdin.write(f' {k.gcn_arch_name}' + " ") proc.stdin.write(','.join( [str(f) for f in k.threads_per_transform])) @@ -598,6 +622,8 @@ def generate_kernels(precisions_dict, kernels, stockham_gen): proc.stdin.write(f' {precisions_dict[k.precision]}') + proc.stdin.write(f' {k.gcn_arch_name}') + # 1D kernels might not, and need to default to 'uwide' threads_per_transform = getattr( k, 'threads_per_transform', { diff --git a/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py b/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py index e9d1472650b..d6e97408f40 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py @@ -19,6 +19,7 @@ # THE SOFTWARE. from kernels.configs import config_lds +from kernels.configs import config_arch from types import SimpleNamespace as NS # yapf: disable diff --git a/projects/rocfft/library/src/device/kernels/configs/config_arch.py b/projects/rocfft/library/src/device/kernels/configs/config_arch.py new file mode 100644 index 00000000000..5382047caac --- /dev/null +++ b/projects/rocfft/library/src/device/kernels/configs/config_arch.py @@ -0,0 +1,39 @@ +# Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +from enum import Enum + + +class supported_arch(Enum): + GFX_GENERIC = "gfx_generic" + GFX_803 = "gfx803" + GFX_900 = "gfx900" + GFX_906 = "gfx906" + GFX_908 = "gfx908" + GFX_90A = "gfx90a" + GFX_942 = "gfx942" + GFX_950 = "gfx950" + GFX_1030 = "gfx1030" + GFX_1100 = "gfx1100" + GFX_1101 = "gfx1101" + GFX_1102 = "gfx1102" + GFX_1151 = "gfx1151" + GFX_1200 = "gfx1200" + GFX_1201 = "gfx1201" diff --git a/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py b/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py index 7ce240a409e..006daa48d18 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py @@ -19,6 +19,7 @@ # THE SOFTWARE. from kernels.configs import config_lds +from kernels.configs import config_arch from types import SimpleNamespace as NS # yapf: disable diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py b/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py index 6eeed173374..55aab410881 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py @@ -19,6 +19,7 @@ # THE SOFTWARE. from kernels.configs import config_lds +from kernels.configs import config_arch from types import SimpleNamespace as NS # Note: Default direct_to_from_reg is True diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py b/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py index 625d2a1d217..fa077493ba4 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py @@ -19,6 +19,7 @@ # THE SOFTWARE. from kernels.configs import config_lds +from kernels.configs import config_arch from types import SimpleNamespace as NS # NB: diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py b/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py index 38ce5f0f37c..b16c7c80ec2 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py @@ -19,6 +19,7 @@ # THE SOFTWARE. from kernels.configs import config_lds +from kernels.configs import config_arch from types import SimpleNamespace as NS # for SBRC, if direct_to_from_reg is True, we do store-from-reg, but will not do load-to-reg diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py b/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py index 7d56eb61d6f..b0aa87a8adb 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py @@ -19,6 +19,7 @@ # THE SOFTWARE. from kernels.configs import config_lds +from kernels.configs import config_arch from types import SimpleNamespace as NS # Note: Default half_lds is True and default direct_to_from_reg is True as well. diff --git a/projects/rocfft/library/src/include/function_map_key.h b/projects/rocfft/library/src/include/function_map_key.h index bf85c67a54c..3065a153d19 100644 --- a/projects/rocfft/library/src/include/function_map_key.h +++ b/projects/rocfft/library/src/include/function_map_key.h @@ -289,10 +289,12 @@ struct FMKeyBase { FMKeyBase(std::array lengths, rocfft_precision precision, - ComputeScheme scheme = CS_NONE) + ComputeScheme scheme = CS_NONE, + std::string gcn_arch_name = get_curr_gcn_arch_name()) : lengths(lengths) , precision(precision) , scheme(scheme) + , gcn_arch_name(gcn_arch_name) { } @@ -310,6 +312,7 @@ struct FMKeyBase rocfft_precision precision; ComputeScheme scheme; + std::string gcn_arch_name; }; // length, precision, scheme are theose fundemantal information of a kernel; @@ -340,8 +343,9 @@ struct FMKey : public FMKeyBase rocfft_precision precision, ComputeScheme scheme = CS_KERNEL_STOCKHAM, SBRC_TRANSPOSE_TYPE transpose = NONE, - KernelConfig kernel_config = KernelConfig::EmptyConfig()) - : FMKeyBase({length0, 0, 0}, precision, scheme) + KernelConfig kernel_config = KernelConfig::EmptyConfig(), + std::string gcn_arch_name = get_curr_gcn_arch_name()) + : FMKeyBase({length0, 0, 0}, precision, scheme, gcn_arch_name) , sbrcTrans(transpose) , kernel_config(kernel_config) @@ -354,8 +358,9 @@ struct FMKey : public FMKeyBase rocfft_precision precision, ComputeScheme scheme = CS_KERNEL_2D_SINGLE, SBRC_TRANSPOSE_TYPE transpose = NONE, - KernelConfig kernel_config = KernelConfig::EmptyConfig()) - : FMKeyBase({length0, length1, 0}, precision, scheme) + KernelConfig kernel_config = KernelConfig::EmptyConfig(), + std::string gcn_arch_name = get_curr_gcn_arch_name()) + : FMKeyBase({length0, length1, 0}, precision, scheme, gcn_arch_name) , sbrcTrans(transpose) , kernel_config(kernel_config) { @@ -365,9 +370,13 @@ struct FMKey : public FMKeyBase bool operator==(const FMKey& rhs) const { - return std::tie(lengths, precision, scheme, sbrcTrans, kernel_config) - == std::tie( - rhs.lengths, rhs.precision, rhs.scheme, rhs.sbrcTrans, rhs.kernel_config); + return std::tie(lengths, precision, scheme, sbrcTrans, kernel_config, gcn_arch_name) + == std::tie(rhs.lengths, + rhs.precision, + rhs.scheme, + rhs.sbrcTrans, + rhs.kernel_config, + rhs.gcn_arch_name); } bool operator!=(const FMKey& rhs) const @@ -377,8 +386,13 @@ struct FMKey : public FMKeyBase bool operator<(const FMKey& rhs) const { - return std::tie(lengths, precision, scheme, sbrcTrans, kernel_config) - < std::tie(rhs.lengths, rhs.precision, rhs.scheme, rhs.sbrcTrans, rhs.kernel_config); + return std::tie(lengths, precision, scheme, sbrcTrans, kernel_config, gcn_arch_name) + < std::tie(rhs.lengths, + rhs.precision, + rhs.scheme, + rhs.sbrcTrans, + rhs.kernel_config, + rhs.gcn_arch_name); } static FMKey EmptyFMKey() @@ -434,7 +448,8 @@ struct ToString str += FieldDescriptor().describe("sbrc_trans", PrintSBRCTransposeType(value.sbrcTrans)) + ","; - str += FieldDescriptor().describe("kernelConfig", value.kernel_config); + str += FieldDescriptor().describe("kernelConfig", value.kernel_config) + ","; + str += FieldDescriptor().describe("gcn_arch_name", value.gcn_arch_name); str += "}"; return str; } @@ -448,18 +463,21 @@ struct FromString std::vector len; std::string precStr, schemeStr, sbrcTransStr; KernelConfig config; + std::string gcn_arch_name; VectorFieldParser().parse("lengths", len, current); FieldParser().parse("precision", precStr, current); FieldParser().parse("scheme", schemeStr, current); FieldParser().parse("sbrc_trans", sbrcTransStr, current); FieldParser().parse("kernelConfig", config, current); + FieldParser().parse("gcn_arch_name", gcn_arch_name, current); ret.lengths = {len[0], len[1]}; ret.precision = StrToPrecision(precStr); ret.scheme = StrToComputeScheme(schemeStr); ret.sbrcTrans = StrToSBRCTransType(sbrcTransStr); ret.kernel_config = config; + ret.gcn_arch_name = gcn_arch_name; } }; @@ -475,6 +493,7 @@ struct SimpleHash h ^= std::hash{}(p.scheme); h ^= std::hash{}(p.sbrcTrans); h ^= std::hash{}(p.kernel_config); + h ^= std::hash{}(p.gcn_arch_name); return h; } @@ -503,8 +522,9 @@ struct PPFMKey : public FMKeyBase rocfft_precision precision, ComputeScheme scheme = CS_3D_PP, KernelConfig kernel_config_1 = KernelConfig::EmptyConfig(), - KernelConfig kernel_config_2 = KernelConfig::EmptyConfig()) - : FMKeyBase({length0, length1, length2}, precision, scheme) + KernelConfig kernel_config_2 = KernelConfig::EmptyConfig(), + std::string gcn_arch_name = get_curr_gcn_arch_name()) + : FMKeyBase({length0, length1, length2}, precision, scheme, gcn_arch_name) , kernel_config_1(kernel_config_1) , kernel_config_2(kernel_config_2) { @@ -514,12 +534,13 @@ struct PPFMKey : public FMKeyBase bool operator==(const PPFMKey& rhs) const { - return std::tie(lengths, precision, scheme, kernel_config_1, kernel_config_2) + return std::tie(lengths, precision, scheme, kernel_config_1, kernel_config_2, gcn_arch_name) == std::tie(rhs.lengths, rhs.precision, rhs.scheme, rhs.kernel_config_1, - rhs.kernel_config_2); + rhs.kernel_config_2, + rhs.gcn_arch_name); } bool operator!=(const PPFMKey& rhs) const @@ -529,12 +550,13 @@ struct PPFMKey : public FMKeyBase bool operator<(const PPFMKey& rhs) const { - return std::tie(lengths, precision, scheme, kernel_config_1, kernel_config_2) + return std::tie(lengths, precision, scheme, kernel_config_1, kernel_config_2, gcn_arch_name) < std::tie(rhs.lengths, rhs.precision, rhs.scheme, rhs.kernel_config_1, - rhs.kernel_config_2); + rhs.kernel_config_2, + rhs.gcn_arch_name); } static PPFMKey EmptyPPFMKey() @@ -562,6 +584,7 @@ struct SimpleHashPP h ^= std::hash{}(p.scheme); h ^= std::hash{}(p.kernel_config_1); h ^= std::hash{}(p.kernel_config_2); + h ^= std::hash{}(p.gcn_arch_name); return h; } diff --git a/projects/rocfft/library/src/include/function_pool.h b/projects/rocfft/library/src/include/function_pool.h index 14ed7d063e9..b5e0c840cf5 100644 --- a/projects/rocfft/library/src/include/function_pool.h +++ b/projects/rocfft/library/src/include/function_pool.h @@ -24,6 +24,7 @@ #define FUNCTION_POOL_H #include "../../../shared/arithmetic.h" +#include "../../../shared/device_properties.h" #include "../../../shared/rocfft_complex.h" #include "../device/kernels/common.h" #include "function_map_key.h" @@ -38,7 +39,8 @@ inline std::string PrintMissingKernelInfoBase(const FMKeyBase& key) msg << "Kernel not found: \n" << "\tlength: " << key.lengths[0] << "," << key.lengths[1] << "\n" << "\tprecision: " << key.precision << "\n" - << "\tscheme: " << PrintScheme(key.scheme) << "\n"; + << "\tscheme: " << PrintScheme(key.scheme) << "\n" + << "\tGCN Arch Name: " << key.gcn_arch_name << "\n"; return msg.str(); } @@ -398,6 +400,9 @@ static void insert_default_entry(const FMKey& def_key, FMKey def_key_with_lds = def_key; def_key_with_lds.lds_size_bytes = lds_size_bytes; + if(def_key_with_lds.gcn_arch_name == generic_gcn_arch_name) + def_key_with_lds.gcn_arch_name = get_curr_gcn_arch_name(); + // simple_key means the same thing as def_key, but we just remove kernel-config // so we don't need to know the exact config when we're lookin' for the default kernel FMKey simple_key{def_key_with_lds}; @@ -419,6 +424,9 @@ static void insert_default_entry(const PPFMKey& def_key, PPFMKey def_key_with_lds = def_key; def_key_with_lds.lds_size_bytes = lds_size_bytes; + if(def_key_with_lds.gcn_arch_name == generic_gcn_arch_name) + def_key_with_lds.gcn_arch_name = get_curr_gcn_arch_name(); + PPFMKey simple_key(def_key_with_lds); simple_key.kernel_config_1 = KernelConfig::EmptyConfig(); diff --git a/projects/rocfft/library/src/rocfft_aot_helper.cpp b/projects/rocfft/library/src/rocfft_aot_helper.cpp index 21c8f5657ce..300baebc16c 100644 --- a/projects/rocfft/library/src/rocfft_aot_helper.cpp +++ b/projects/rocfft/library/src/rocfft_aot_helper.cpp @@ -246,6 +246,7 @@ void build_stockham_function_pool(CompileQueue& queue) StockhamGeneratorSpecs specs{factors, {}, static_cast(precision), + get_curr_gcn_arch_name(), static_cast(i.second.workgroup_size), PrintScheme(scheme)}; specs.threads_per_transform = i.second.threads_per_transform[0]; @@ -664,6 +665,7 @@ void build_solution_kernels(CompileQueue& queue) StockhamGeneratorSpecs specs{factors, {}, static_cast(precision), + get_curr_gcn_arch_name(), static_cast(config.workgroup_size), PrintScheme(scheme)}; specs.threads_per_transform = config.threads_per_transform[0]; diff --git a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp index 619e83ff2f6..5fa68d3ef78 100644 --- a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp +++ b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp @@ -31,6 +31,7 @@ #include "../../shared/CLI11.hpp" #include "../../shared/arithmetic.h" +#include "../../shared/device_properties.h" #include "../../shared/gpubuf.h" #include "../../shared/hip_object_wrapper.h" #include "device/generator/stockham_gen.h" @@ -160,6 +161,7 @@ std::string test_kernel_src(const std::string& kernel_name, StockhamGeneratorSpecs specs{factorization, {}, static_cast(rocfft_precision_single), + get_curr_gcn_arch_name(), wgs, PrintScheme(compute_scheme)}; diff --git a/projects/rocfft/library/src/rtc_stockham_kernel.cpp b/projects/rocfft/library/src/rtc_stockham_kernel.cpp index b60d2843e59..ad4e5be59f6 100644 --- a/projects/rocfft/library/src/rtc_stockham_kernel.cpp +++ b/projects/rocfft/library/src/rtc_stockham_kernel.cpp @@ -79,6 +79,7 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs.emplace(factors, std::vector(), precision, + get_curr_gcn_arch_name(), static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs->threads_per_transform = kernel->threads_per_transform[0]; @@ -126,6 +127,7 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs.emplace(factors1d, factors2d, precision, + get_curr_gcn_arch_name(), static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs->threads_per_transform = kernel->threads_per_transform[0]; @@ -135,6 +137,7 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs2d.emplace(factors2d, factors1d, precision, + get_curr_gcn_arch_name(), static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs2d->threads_per_transform = kernel->threads_per_transform[1]; diff --git a/projects/rocfft/shared/device_properties.h b/projects/rocfft/shared/device_properties.h index becc4c264e3..81bbe39b202 100644 --- a/projects/rocfft/shared/device_properties.h +++ b/projects/rocfft/shared/device_properties.h @@ -26,6 +26,10 @@ #include #include +// Generic GCN arch name for configuration entries that +// do not report a specific name in kernel_generator.py +static const std::string generic_gcn_arch_name("gfx_generic"); + // get device properties static hipDeviceProp_t get_curr_device_prop() { @@ -41,6 +45,12 @@ static hipDeviceProp_t get_curr_device_prop() return prop; } +// get device GCN arch name +static std::string get_curr_gcn_arch_name() +{ + return get_curr_device_prop().gcnArchName; +} + // check that the given grid/block dims will fit into the limits in // the device properties. throws std::runtime_error if the limits // are exceeded. From 15761c21dd64dc6271bbf91184122f181db79fe4 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Thu, 27 Nov 2025 11:57:25 -0700 Subject: [PATCH 04/19] - Further changes for arch support in the function pool. --- projects/rocfft/library/src/CMakeLists.txt | 2 +- .../src/device/generator/stockham_gen.cpp | 8 ++- .../library/src/device/kernel-generator.py | 61 +++++++++++++------ .../kernels/configs/config_2d_single.py | 36 +++++------ .../src/device/kernels/configs/config_arch.py | 5 ++ .../src/device/kernels/configs/config_lds.py | 21 ------- .../device/kernels/configs/config_pp_3d.py | 4 +- .../src/device/kernels/configs/config_sbcc.py | 4 +- .../src/device/kernels/configs/config_sbcr.py | 4 +- .../src/device/kernels/configs/config_sbrc.py | 4 +- .../src/device/kernels/configs/config_sbrr.py | 36 +++++------ .../library/src/include/function_pool.h | 56 +++++++++-------- 12 files changed, 129 insertions(+), 112 deletions(-) delete mode 100644 projects/rocfft/library/src/device/kernels/configs/config_lds.py diff --git a/projects/rocfft/library/src/CMakeLists.txt b/projects/rocfft/library/src/CMakeLists.txt index f7782f7029f..bc332c42caf 100644 --- a/projects/rocfft/library/src/CMakeLists.txt +++ b/projects/rocfft/library/src/CMakeLists.txt @@ -151,7 +151,7 @@ set( kgen_logic_files # python code that decides kernel parameters ${CMAKE_SOURCE_DIR}/library/src/device/kernel-generator.py ${CMAKE_SOURCE_DIR}/library/src/device/generator.py - ${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_lds.py + ${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_arch.py ${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_sbrr.py ${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_sbcc.py ${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_sbcr.py diff --git a/projects/rocfft/library/src/device/generator/stockham_gen.cpp b/projects/rocfft/library/src/device/generator/stockham_gen.cpp index f7f3234fae4..8ca2d802975 100644 --- a/projects/rocfft/library/src/device/generator/stockham_gen.cpp +++ b/projects/rocfft/library/src/device/generator/stockham_gen.cpp @@ -22,6 +22,7 @@ using namespace std::placeholders; #include "../../../../shared/arithmetic.h" +#include "../../../../shared/device_properties.h" #include "../../../../shared/precision_type.h" #include "generator.h" #include "stockham_gen.h" @@ -796,8 +797,11 @@ int main() specs2d.threads_per_transform = threads_per_transform.back(); // 2D_SINGLE kernels use the specified workgroup size - // directly - if(scheme == "CS_KERNEL_2D_SINGLE") + // directly. + // Kernels with an architecture other than generic will + // also use the workgroup size directly, as the calculations + // for wgs_is_derived=false assume an LDS size of 64KiB. + if(scheme == "CS_KERNEL_2D_SINGLE" || gcn_arch_name != generic_gcn_arch_name) { specs.wgs_is_derived = true; specs2d.wgs_is_derived = true; diff --git a/projects/rocfft/library/src/device/kernel-generator.py b/projects/rocfft/library/src/device/kernel-generator.py index d23253bd3a0..80964b392ce 100644 --- a/projects/rocfft/library/src/device/kernel-generator.py +++ b/projects/rocfft/library/src/device/kernel-generator.py @@ -87,13 +87,14 @@ def cjoin(xs): # Helpers # def get_kernel_key(kernel): - """Return a key tuple for a kernel based on (length, scheme, lds_size_bytes).""" + """Return a key tuple for a kernel based on (length, scheme, lds_size_bytes, gcn_arch_name).""" if isinstance(kernel.length, list): length_key = tuple(kernel.length) else: length_key = kernel.length - key = (length_key, kernel.scheme, kernel.lds_size_bytes) + key = (length_key, kernel.scheme, kernel.lds_size_bytes, + kernel.gcn_arch_name) return key @@ -101,45 +102,72 @@ def get_kernel_key(kernel): def merge_kernel_list(kernels, all_precisions): """Merge precision and architecture lists with kernel list. Check for duplicated kernel and invalid precision/arch entries.""" + r, s = list(), set() + all_archs = [member.value for member in config_arch.supported_arch] + all_lds_configs = [member.value for member in config_arch.lds_config] + + lds_size_err_msg = "Error: invalid lds_size_bytes in kernel configuration: \n" + arch_err_msg = "Error: invalid architecture in kernel configuration: \n" + prec_err_msg = "Error: invalid precision in kernel configuration: \n" + dup_err_msg = "Error: duplicated entry in kernel configuration: \n" for kernel in kernels: if hasattr(kernel, 'precision'): + if not isinstance(kernel.precision, list): + kernel.precision = [kernel.precision] + precisions = kernel.precision else: precisions = all_precisions if hasattr(kernel, 'gcn_arch_name'): - archs = [member.value for member in kernel.gcn_arch_name] + if not isinstance(kernel.gcn_arch_name, list): + kernel.gcn_arch_name = [kernel.gcn_arch_name] + archs = kernel.gcn_arch_name + if hasattr(kernel, 'lds_size_bytes'): + # lds size not allowed to be specified per arch + print(lds_size_err_msg + str(kernel)) + sys.exit(1) + # lds size will be determined at runtime based on arch + kernel.lds_size_bytes = 0 else: + # default to gfx generic if no arch specified archs = [config_arch.supported_arch.GFX_GENERIC.value] + if hasattr(kernel, 'lds_size_bytes'): + if isinstance(kernel.lds_size_bytes, list): + # only one lds size allowed if gfx generic + print(lds_size_err_msg + str(kernel)) + sys.exit(1) + + if kernel.lds_size_bytes not in all_lds_configs: + print(lds_size_err_msg + str(kernel)) + sys.exit(1) + else: + # default lds size to 64KiB + kernel.lds_size_bytes = config_arch.lds_config.SIZE_64KiB.value + for a in archs: if a not in all_archs: - print( - "Error: invalid architecture in kernel configuration: \n" + - str(kernel)) + print(arch_err_msg + str(kernel)) sys.exit(1) for p in precisions: if p not in all_precisions: - print( - "Error: invalid precision in kernel configuration: \n" - + str(kernel)) + print(prec_err_msg + str(kernel)) sys.exit(1) kernel_cpy = copy.copy(kernel) kernel_cpy.precision = p kernel_cpy.gcn_arch_name = a - key = (get_kernel_key(kernel_cpy), kernel_cpy.precision, - kernel_cpy.gcn_arch_name) + key = (get_kernel_key(kernel_cpy), kernel_cpy.precision) if key not in s: s.add(key) r.append(kernel_cpy) else: - print("Error: duplicated entry in kernel configuration: " + - str(kernel)) + print(dup_err_msg + str(kernel)) sys.exit(1) return r @@ -741,13 +769,8 @@ def cli(): kernels += all_kernels + kernels_2d + kernel_3d_pp - # set default lds size (64k) on kernels if not specified - for k in kernels: - if not hasattr(k, 'lds_size_bytes'): - k.lds_size_bytes = 65536 - # - # merge kernel list with additional precision entries if required + # merge kernel list with additional precision/arch entries if required # kernels = merge_kernel_list(kernels, list(precisions_dict)) diff --git a/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py b/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py index d6e97408f40..963379b364e 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py @@ -18,8 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds -from kernels.configs import config_arch +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # yapf: disable @@ -162,20 +162,20 @@ NS(length=[96,42], factors=[[6,16],[7,6]], threads_per_transform=[6,6], workgroup_size=576), # configs for 160KiB LDS - NS(length=[40,108], factors=[[8,5],[6,3,6]], threads_per_transform=[8,12], workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), - NS(length=[108,40], factors=[[6,3,6],[8,5]], threads_per_transform=[12,8], workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), - NS(length=[48,100], factors=[[8,6],[10,10]], threads_per_transform=[8,10], workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), - NS(length=[100,48], factors=[[10,10],[8,6]], threads_per_transform=[10,8], workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), - NS(length=[48,48], factors=[[8,6],[8,6]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=config_lds.LDS_160k), - NS(length=[48,96], factors=[[8,6],[4,6,4]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=config_lds.LDS_160k), - NS(length=[96,48], factors=[[4,6,4],[8,6]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=config_lds.LDS_160k), - NS(length=[50,100], factors=[[10,5],[10,10]], threads_per_transform=[10,10], workgroup_size=250, lds_size_bytes=config_lds.LDS_160k), - NS(length=[100,50], factors=[[10,10],[10,5]], threads_per_transform=[10,10], workgroup_size=250, lds_size_bytes=config_lds.LDS_160k), - NS(length=[50,96], factors=[[10,5],[4,6,4]], threads_per_transform=[10,16], precision=['sp', 'hp'], workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), - NS(length=[54,108], factors=[[6,3,3],[6,3,6]], threads_per_transform=[6,12], precision=['sp', 'hp'], workgroup_size=243, lds_size_bytes=config_lds.LDS_160k), - NS(length=[108,54], factors=[[6,3,6],[6,3,3]], threads_per_transform=[12,6], precision=['sp', 'hp'], workgroup_size=243, lds_size_bytes=config_lds.LDS_160k), - NS(length=[64,128], factors=[[4,4,4],[4,8,4]], threads_per_transform=[8,16], precision=['sp', 'hp'], workgroup_size=512, lds_size_bytes=config_lds.LDS_160k), - NS(length=[128,64], factors=[[4,8,4],[4,4,4]], threads_per_transform=[16,8], precision=['sp', 'hp'], workgroup_size=512, lds_size_bytes=config_lds.LDS_160k), - NS(length=[96,96], factors=[[4,6,4],[4,6,4]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=config_lds.LDS_160k), - NS(length=[100,100], factors=[[10,10],[10,10]], threads_per_transform=[10,10], precision=['sp', 'hp'], workgroup_size=500, lds_size_bytes=config_lds.LDS_160k), + NS(length=[40,108], factors=[[8,5],[6,3,6]], threads_per_transform=[8,12], workgroup_size=240, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[108,40], factors=[[6,3,6],[8,5]], threads_per_transform=[12,8], workgroup_size=240, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[48,100], factors=[[8,6],[10,10]], threads_per_transform=[8,10], workgroup_size=240, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[100,48], factors=[[10,10],[8,6]], threads_per_transform=[10,8], workgroup_size=240, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[48,48], factors=[[8,6],[8,6]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[48,96], factors=[[8,6],[4,6,4]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[96,48], factors=[[4,6,4],[8,6]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[50,100], factors=[[10,5],[10,10]], threads_per_transform=[10,10], workgroup_size=250, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[100,50], factors=[[10,10],[10,5]], threads_per_transform=[10,10], workgroup_size=250, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[50,96], factors=[[10,5],[4,6,4]], threads_per_transform=[10,16], precision=['sp', 'hp'], workgroup_size=240, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[54,108], factors=[[6,3,3],[6,3,6]], threads_per_transform=[6,12], precision=['sp', 'hp'], workgroup_size=243, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[108,54], factors=[[6,3,6],[6,3,3]], threads_per_transform=[12,6], precision=['sp', 'hp'], workgroup_size=243, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[64,128], factors=[[4,4,4],[4,8,4]], threads_per_transform=[8,16], precision=['sp', 'hp'], workgroup_size=512, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[128,64], factors=[[4,8,4],[4,4,4]], threads_per_transform=[16,8], precision=['sp', 'hp'], workgroup_size=512, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[96,96], factors=[[4,6,4],[4,6,4]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[100,100], factors=[[10,10],[10,10]], threads_per_transform=[10,10], precision=['sp', 'hp'], workgroup_size=500, lds_size_bytes=lds_config.SIZE_160KiB.value), ] diff --git a/projects/rocfft/library/src/device/kernels/configs/config_arch.py b/projects/rocfft/library/src/device/kernels/configs/config_arch.py index 5382047caac..850c969c073 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_arch.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_arch.py @@ -21,6 +21,11 @@ from enum import Enum +class lds_config(Enum): + SIZE_64KiB = 64 * 1024 + SIZE_160KiB = 160 * 1024 + + class supported_arch(Enum): GFX_GENERIC = "gfx_generic" GFX_803 = "gfx803" diff --git a/projects/rocfft/library/src/device/kernels/configs/config_lds.py b/projects/rocfft/library/src/device/kernels/configs/config_lds.py deleted file mode 100644 index 52dd1a558af..00000000000 --- a/projects/rocfft/library/src/device/kernels/configs/config_lds.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -LDS_160k = 160 * 1024 diff --git a/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py b/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py index 006daa48d18..54d619a33da 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py @@ -18,8 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds -from kernels.configs import config_arch +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # yapf: disable diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py b/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py index 55aab410881..3720ce793b2 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py @@ -18,8 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds -from kernels.configs import config_arch +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # Note: Default direct_to_from_reg is True diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py b/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py index fa077493ba4..c926bdb4f67 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py @@ -18,8 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds -from kernels.configs import config_arch +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # NB: diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py b/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py index b16c7c80ec2..bd53541ddeb 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py @@ -18,8 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds -from kernels.configs import config_arch +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # for SBRC, if direct_to_from_reg is True, we do store-from-reg, but will not do load-to-reg diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py b/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py index b0aa87a8adb..99521991615 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py @@ -18,8 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds -from kernels.configs import config_arch +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # Note: Default half_lds is True and default direct_to_from_reg is True as well. @@ -490,20 +490,20 @@ NS(length=8192, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 4, 4, 8), precision=['sp', 'hp'], runtime_compile=True), # configs for 160KiB LDS - NS(length=4704, workgroup_size=256, threads_per_transform=224, factors=(8, 4, 7, 7, 3), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=5488, workgroup_size=256, threads_per_transform=196, factors=(7, 4, 7, 4, 7), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=6144, workgroup_size=384, threads_per_transform=256, factors=(4, 8, 8, 8, 3), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=6561, workgroup_size=256, threads_per_transform=243, factors=(3, 3, 3, 3, 3, 3, 3, 3), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=8192, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 16, 8), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=9216, workgroup_size=512, threads_per_transform=512, factors=(4, 8, 4, 4, 3, 6), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=10000, workgroup_size=512, threads_per_transform=500, factors=(4, 5, 5, 10, 10), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=10240, workgroup_size=512, threads_per_transform=512, factors=(8, 4, 4, 4, 5, 4), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=10752, workgroup_size=512, threads_per_transform=512, factors=(4, 16, 8, 7, 3), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=11200, workgroup_size=512, threads_per_transform=448, factors=(4, 7, 5, 16, 5), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=12288, workgroup_size=512, threads_per_transform=512, factors=(8, 8, 4, 6, 8), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=16384, workgroup_size=512, threads_per_transform=512, factors=(8, 16, 4, 8, 4), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=16807, workgroup_size=384, threads_per_transform=343, factors=(7, 7, 7, 7, 7), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=18816, workgroup_size=512, threads_per_transform=448, factors=(8, 8, 7, 7, 6), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=19200, workgroup_size=512, threads_per_transform=480, factors=(8, 10, 8, 5, 6), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=20480, workgroup_size=512, threads_per_transform=512, factors=(4, 4, 16, 10, 8), precision=['sp', 'hp'], lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), + NS(length=4704, workgroup_size=256, threads_per_transform=224, factors=(8, 4, 7, 7, 3), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=5488, workgroup_size=256, threads_per_transform=196, factors=(7, 4, 7, 4, 7), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=6144, workgroup_size=384, threads_per_transform=256, factors=(4, 8, 8, 8, 3), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=6561, workgroup_size=256, threads_per_transform=243, factors=(3, 3, 3, 3, 3, 3, 3, 3), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=8192, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 16, 8), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=9216, workgroup_size=512, threads_per_transform=512, factors=(4, 8, 4, 4, 3, 6), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=10000, workgroup_size=512, threads_per_transform=500, factors=(4, 5, 5, 10, 10), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=10240, workgroup_size=512, threads_per_transform=512, factors=(8, 4, 4, 4, 5, 4), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=10752, workgroup_size=512, threads_per_transform=512, factors=(4, 16, 8, 7, 3), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=11200, workgroup_size=512, threads_per_transform=448, factors=(4, 7, 5, 16, 5), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=12288, workgroup_size=512, threads_per_transform=512, factors=(8, 8, 4, 6, 8), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=16384, workgroup_size=512, threads_per_transform=512, factors=(8, 16, 4, 8, 4), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=16807, workgroup_size=384, threads_per_transform=343, factors=(7, 7, 7, 7, 7), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=18816, workgroup_size=512, threads_per_transform=448, factors=(8, 8, 7, 7, 6), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=19200, workgroup_size=512, threads_per_transform=480, factors=(8, 10, 8, 5, 6), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=20480, workgroup_size=512, threads_per_transform=512, factors=(4, 4, 16, 10, 8), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), ] diff --git a/projects/rocfft/library/src/include/function_pool.h b/projects/rocfft/library/src/include/function_pool.h index b5e0c840cf5..347ba1729c0 100644 --- a/projects/rocfft/library/src/include/function_pool.h +++ b/projects/rocfft/library/src/include/function_pool.h @@ -213,26 +213,30 @@ class function_pool return best; } - const FMKey& get_actual_key(const FMKey& key) const + template + const TKey& get_actual_key(const TKey& key, TKeyPool& pool) const { // - for keys that we are querying with no/empty kernel-config, actually we are refering to // the default kernel-configs in kernel-generator.py. So get the actual keys to look-up // the pool. - // - if not in the def_key_pool, then we simply use itself (for dynamically added kernel) - auto it = find_key_in_map(def_key_pool, key); - if(it != def_key_pool.end()) - return it->second; - else - return key; - } + // - if not in the pool, then we simply use itself (for dynamically added kernel) - const PPFMKey& get_actual_key(const PPFMKey& key) const - { - auto it = find_key_in_map(def_pp_key_pool, key); - if(it != def_pp_key_pool.end()) + // First attempt an exact match with the given architecture in gcn_arch_name if possible + auto it = find_key_in_map(pool, key); + if(it != pool.end()) return it->second; else - return key; + { + // If a match is not found, try it with the generic arch kernel + auto key_copy = key; + key_copy.gcn_arch_name = generic_gcn_arch_name; + + auto it = find_key_in_map(pool, key_copy); + if(it != pool.end()) + return it->second; + else + return key; + } } public: @@ -285,13 +289,13 @@ class function_pool bool has_function(const FMKey& key) const { - auto real_key = get_actual_key(key); + auto real_key = get_actual_key(key, def_key_pool); return find_key_in_map(function_map, real_key) != function_map.end(); } bool has_function(const PPFMKey& key) const { - auto real_key = get_actual_key(key); + auto real_key = get_actual_key(key, def_pp_key_pool); return find_key_in_map(pp_function_map, real_key) != pp_function_map.end(); } @@ -330,7 +334,7 @@ class function_pool FFTKernel get_kernel(const FMKey& key) const { - auto real_key = get_actual_key(key); + auto real_key = get_actual_key(key, def_key_pool); auto it = find_key_in_map(function_map, real_key); if(it == function_map.end()) throw std::out_of_range("kernel not found in map"); @@ -339,7 +343,7 @@ class function_pool FFTKernel get_kernel(const PPFMKey& key, ComputeScheme scheme) const { - auto real_key = get_actual_key(key); + auto real_key = get_actual_key(key, def_pp_key_pool); auto it = find_key_in_map(pp_function_map, real_key); if(it == pp_function_map.end()) throw std::out_of_range("kernel not found in partial-pass map"); @@ -397,11 +401,12 @@ static void insert_default_entry(const FMKey& def_key, FPMap& function_map, size_t lds_size_bytes) { - FMKey def_key_with_lds = def_key; - def_key_with_lds.lds_size_bytes = lds_size_bytes; + FMKey def_key_with_lds = def_key; - if(def_key_with_lds.gcn_arch_name == generic_gcn_arch_name) - def_key_with_lds.gcn_arch_name = get_curr_gcn_arch_name(); + // Specifically add the current device's max LDS size if not a generic arch entry + def_key_with_lds.lds_size_bytes = def_key.gcn_arch_name == generic_gcn_arch_name + ? lds_size_bytes + : get_curr_device_prop().sharedMemPerBlock; // simple_key means the same thing as def_key, but we just remove kernel-config // so we don't need to know the exact config when we're lookin' for the default kernel @@ -421,11 +426,12 @@ static void insert_default_entry(const PPFMKey& def_key, PPFPMap& function_map, size_t lds_size_bytes) { - PPFMKey def_key_with_lds = def_key; - def_key_with_lds.lds_size_bytes = lds_size_bytes; + PPFMKey def_key_with_lds = def_key; - if(def_key_with_lds.gcn_arch_name == generic_gcn_arch_name) - def_key_with_lds.gcn_arch_name = get_curr_gcn_arch_name(); + // Specifically add the current device's max LDS size if not a generic arch entry + def_key_with_lds.lds_size_bytes = def_key.gcn_arch_name == generic_gcn_arch_name + ? lds_size_bytes + : get_curr_device_prop().sharedMemPerBlock; PPFMKey simple_key(def_key_with_lds); From 7217193753f136b07045101c682db0310cacb8f4 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Thu, 27 Nov 2025 12:26:44 -0700 Subject: [PATCH 05/19] - CHANGELOG. --- projects/rocfft/CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/rocfft/CHANGELOG.md b/projects/rocfft/CHANGELOG.md index 8bb525c491b..79c59518d82 100644 --- a/projects/rocfft/CHANGELOG.md +++ b/projects/rocfft/CHANGELOG.md @@ -5,6 +5,10 @@ Documentation for rocFFT is available at ## rocFFT 1.0.36 (unreleased) +### Added + +* Added support for per precision and architecture kernel configuration entries in the library. + ### Optimized * Removed a potential unnecessary global transpose operation from MPI 3D multi-GPU pencil decompositions. From 326090ef94a0c1bc1dbf20a19d1b59130fdc48f5 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Mon, 1 Dec 2025 15:05:06 -0700 Subject: [PATCH 06/19] - Remove suffix from get_curr_gcn_arch_name(). --- projects/rocfft/shared/device_properties.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/projects/rocfft/shared/device_properties.h b/projects/rocfft/shared/device_properties.h index 81bbe39b202..67526cdbdd7 100644 --- a/projects/rocfft/shared/device_properties.h +++ b/projects/rocfft/shared/device_properties.h @@ -45,10 +45,18 @@ static hipDeviceProp_t get_curr_device_prop() return prop; } -// get device GCN arch name +// Get current device GCN arch name without any suffix after ':' static std::string get_curr_gcn_arch_name() { - return get_curr_device_prop().gcnArchName; + auto dev_prop = get_curr_device_prop(); + std::string arch_name_full = dev_prop.gcnArchName; + + auto suffix_char = ":"; + auto pos = arch_name_full.find(suffix_char); + if(pos != std::string::npos) + arch_name_full.erase(pos); + + return arch_name_full; } // check that the given grid/block dims will fit into the limits in From 8a9ee5b3d2e49ad63040a944e615f4c0eea20473 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Thu, 4 Dec 2025 14:29:59 -0700 Subject: [PATCH 07/19] - Address review suggestion. --- projects/rocfft/shared/device_properties.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/projects/rocfft/shared/device_properties.h b/projects/rocfft/shared/device_properties.h index 67526cdbdd7..9640edaa5c1 100644 --- a/projects/rocfft/shared/device_properties.h +++ b/projects/rocfft/shared/device_properties.h @@ -51,8 +51,11 @@ static std::string get_curr_gcn_arch_name() auto dev_prop = get_curr_device_prop(); std::string arch_name_full = dev_prop.gcnArchName; - auto suffix_char = ":"; - auto pos = arch_name_full.find(suffix_char); + if(arch_name_full.empty()) + throw std::runtime_error("Device GCN arch name is empty."); + + auto delimiter = ":"; + auto pos = arch_name_full.find(delimiter); if(pos != std::string::npos) arch_name_full.erase(pos); From 7b0d13412ca2f69581e6b176435770ae17ca0001 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Mon, 8 Dec 2025 11:05:15 -0700 Subject: [PATCH 08/19] - Add missing gcn_arch_name in kernel_name() --- projects/rocfft/library/src/device/kernel-generator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/projects/rocfft/library/src/device/kernel-generator.py b/projects/rocfft/library/src/device/kernel-generator.py index 80964b392ce..9ba8bc537db 100644 --- a/projects/rocfft/library/src/device/kernel-generator.py +++ b/projects/rocfft/library/src/device/kernel-generator.py @@ -428,6 +428,8 @@ def kernel_name(ns): if hasattr(ns, 'lds_size_bytes'): postfix += f'_lds{ns.lds_size_bytes}' + postfix += f'_{ns.gcn_arch_name}' + return f'rocfft_len{length}{postfix}' From 7055d54b172e264d172e11ab09883c1520303eed Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Mon, 8 Dec 2025 15:49:02 -0700 Subject: [PATCH 09/19] - Fix gcn_arch_name handling for partial-pass kernels. --- .../rocfft/library/src/device/kernel-generator.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/projects/rocfft/library/src/device/kernel-generator.py b/projects/rocfft/library/src/device/kernel-generator.py index 9ba8bc537db..b24fd290c36 100644 --- a/projects/rocfft/library/src/device/kernel-generator.py +++ b/projects/rocfft/library/src/device/kernel-generator.py @@ -338,7 +338,7 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): # get first pp kernel f_pp_1 = pp_functions[counter_f_pp_1] - # PPFMKey entry needs two kernels with same length and precision, but different pp_current_dim + # PPFMKey entry needs two kernels with same length, precision, and arch, but different pp_current_dim counter_f_pp_2 = counter_f_pp_1 + 1 if counter_f_pp_2 >= len(pp_functions): break @@ -346,14 +346,16 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): while counter_f_pp_2 < len(pp_functions): f_pp_2 = pp_functions[counter_f_pp_2] if (f_pp_1.meta.length == f_pp_2.meta.length - and f_pp_1.meta.precision == f_pp_2.meta.precision + and f_pp_1.meta.precision == f_pp_2.meta.precision and + f_pp_1.meta.gcn_arch_name == f_pp_2.meta.gcn_arch_name and f_pp_1.meta.pp_current_dim != f_pp_2.meta.pp_current_dim): break if (f_pp_1.meta.length != f_pp_2.meta.length or (f_pp_1.meta.length == f_pp_2.meta.length - and f_pp_1.meta.precision != f_pp_2.meta.precision)): - # we hit a new kernel with different length/precision + and f_pp_1.meta.precision != f_pp_2.meta.precision and + f_pp_1.meta.gcn_arch_name != f_pp_2.meta.gcn_arch_name)): + # we hit a new kernel with different length/precision/arch # start next iteration looking for the next pair counter_f_pp_1 = counter_f_pp_2 skip_to_next_iter = True @@ -376,14 +378,14 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): length = f_pp_1.meta.length precision = f_pp_1.meta.precision scheme = f_pp_1.meta.scheme + arch_name = f_pp_1.meta.gcn_arch_name key = Call(name='PPFMKey', arguments=ArgumentList( length[0], length[1], length[2], precisions[precision], scheme, 'pp_kernel_1.get_kernel_config()', 'pp_kernel_2.get_kernel_config()', - ''.join(['"', f_pp_1.meta.gcn_arch_name, - '"']))).inline() + ''.join(['"', arch_name, '"']))).inline() piece_contents[curr_file] += function_map.insert_pp( key, var_pp_kernel_1, var_pp_kernel_2, 'std::get<1>(def_keys)', 'std::get<1>(function_maps)', f_pp_1.meta.lds_size_bytes) From 554633e57ca2060cf1be9f62454d578bed864c35 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Mon, 8 Dec 2025 18:54:40 -0700 Subject: [PATCH 10/19] - Fix partial-pass kernel check. --- projects/rocfft/library/src/device/kernel-generator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/rocfft/library/src/device/kernel-generator.py b/projects/rocfft/library/src/device/kernel-generator.py index b24fd290c36..8d6c635d404 100644 --- a/projects/rocfft/library/src/device/kernel-generator.py +++ b/projects/rocfft/library/src/device/kernel-generator.py @@ -351,10 +351,10 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): and f_pp_1.meta.pp_current_dim != f_pp_2.meta.pp_current_dim): break - if (f_pp_1.meta.length != f_pp_2.meta.length or - (f_pp_1.meta.length == f_pp_2.meta.length - and f_pp_1.meta.precision != f_pp_2.meta.precision and - f_pp_1.meta.gcn_arch_name != f_pp_2.meta.gcn_arch_name)): + if (f_pp_1.meta.length != f_pp_2.meta.length or ( + f_pp_1.meta.length == f_pp_2.meta.length and + (f_pp_1.meta.precision != f_pp_2.meta.precision or + f_pp_1.meta.gcn_arch_name != f_pp_2.meta.gcn_arch_name))): # we hit a new kernel with different length/precision/arch # start next iteration looking for the next pair counter_f_pp_1 = counter_f_pp_2 From 24db8682975988da14f75ff78cd280ddc49fdaeb Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Wed, 10 Dec 2025 09:59:49 -0700 Subject: [PATCH 11/19] - Add comment. --- projects/rocfft/library/src/device/kernel-generator.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/projects/rocfft/library/src/device/kernel-generator.py b/projects/rocfft/library/src/device/kernel-generator.py index 8d6c635d404..dc88206799d 100644 --- a/projects/rocfft/library/src/device/kernel-generator.py +++ b/projects/rocfft/library/src/device/kernel-generator.py @@ -179,8 +179,15 @@ def set_bytes_per_element(kernels): wgs calculation can be overridden by using wgs_is_derived = true in kernel config. NOTE: Once all kernels are tuned by precision, this function can most likely go away. """ + d = dict() - # add precision entries for a given (length, scheme, lds_size_bytes) key + + # For a given kernel key (length, scheme, lds_size_bytes, gcn_arch_name), + # we can have double-, single-, and half-precision entries, but those entries + # can be completely out of order in kernels list. We need the first loop to know, + # for a given key, what precision entries are available. The second loop assigns + # the maximum bytes per element, based on the available precision entries for a + # given key. for kernel in kernels: key = get_kernel_key(kernel) if key not in d: From ea12d45fe867efc6e895c07eb27ba402618941cd Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Wed, 10 Dec 2025 14:05:46 -0700 Subject: [PATCH 12/19] - No need to put device arch name in StockhamGeneratorSpecs for aot and solution map kernel builds. --- projects/rocfft/library/src/rocfft_aot_helper.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/rocfft/library/src/rocfft_aot_helper.cpp b/projects/rocfft/library/src/rocfft_aot_helper.cpp index 300baebc16c..2523b19dc55 100644 --- a/projects/rocfft/library/src/rocfft_aot_helper.cpp +++ b/projects/rocfft/library/src/rocfft_aot_helper.cpp @@ -246,7 +246,7 @@ void build_stockham_function_pool(CompileQueue& queue) StockhamGeneratorSpecs specs{factors, {}, static_cast(precision), - get_curr_gcn_arch_name(), + "", static_cast(i.second.workgroup_size), PrintScheme(scheme)}; specs.threads_per_transform = i.second.threads_per_transform[0]; @@ -665,7 +665,7 @@ void build_solution_kernels(CompileQueue& queue) StockhamGeneratorSpecs specs{factors, {}, static_cast(precision), - get_curr_gcn_arch_name(), + "", static_cast(config.workgroup_size), PrintScheme(scheme)}; specs.threads_per_transform = config.threads_per_transform[0]; From 84672356ef3d8064b4547f02beb616257b182e7e Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Wed, 10 Dec 2025 14:27:06 -0700 Subject: [PATCH 13/19] - Remove no longer needed calls to get_curr_gcn_arch_name to construct StockhamGeneratorSpecs. --- projects/rocfft/library/src/rocfft_kernel_config_search.cpp | 2 +- projects/rocfft/library/src/rtc_stockham_kernel.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp index 5fa68d3ef78..9017942c86f 100644 --- a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp +++ b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp @@ -161,7 +161,7 @@ std::string test_kernel_src(const std::string& kernel_name, StockhamGeneratorSpecs specs{factorization, {}, static_cast(rocfft_precision_single), - get_curr_gcn_arch_name(), + "", wgs, PrintScheme(compute_scheme)}; diff --git a/projects/rocfft/library/src/rtc_stockham_kernel.cpp b/projects/rocfft/library/src/rtc_stockham_kernel.cpp index ad4e5be59f6..a2983785028 100644 --- a/projects/rocfft/library/src/rtc_stockham_kernel.cpp +++ b/projects/rocfft/library/src/rtc_stockham_kernel.cpp @@ -79,7 +79,7 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs.emplace(factors, std::vector(), precision, - get_curr_gcn_arch_name(), + "", static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs->threads_per_transform = kernel->threads_per_transform[0]; @@ -127,7 +127,7 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs.emplace(factors1d, factors2d, precision, - get_curr_gcn_arch_name(), + "", static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs->threads_per_transform = kernel->threads_per_transform[0]; @@ -137,7 +137,7 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs2d.emplace(factors2d, factors1d, precision, - get_curr_gcn_arch_name(), + "", static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs2d->threads_per_transform = kernel->threads_per_transform[1]; From a429b2bc58b72d028008322a03682cfa00d984d1 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Wed, 10 Dec 2025 14:33:10 -0700 Subject: [PATCH 14/19] - Remove header include. --- projects/rocfft/library/src/rocfft_kernel_config_search.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp index 9017942c86f..32fed2b46fa 100644 --- a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp +++ b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp @@ -31,7 +31,6 @@ #include "../../shared/CLI11.hpp" #include "../../shared/arithmetic.h" -#include "../../shared/device_properties.h" #include "../../shared/gpubuf.h" #include "../../shared/hip_object_wrapper.h" #include "device/generator/stockham_gen.h" From 6114b2f7827014b4488cafbd34eafcebe1d70a60 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Wed, 10 Dec 2025 15:32:49 -0700 Subject: [PATCH 15/19] - Handle the case where FMKey is constructed without a visible HIP device. --- projects/rocfft/shared/device_properties.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/projects/rocfft/shared/device_properties.h b/projects/rocfft/shared/device_properties.h index 9640edaa5c1..e8095ef24e2 100644 --- a/projects/rocfft/shared/device_properties.h +++ b/projects/rocfft/shared/device_properties.h @@ -46,8 +46,15 @@ static hipDeviceProp_t get_curr_device_prop() } // Get current device GCN arch name without any suffix after ':' +// If no devices are found, return empty string static std::string get_curr_gcn_arch_name() { + int dev_count = 0; + if(hipGetDeviceCount(&dev_count) != hipSuccess) + throw std::runtime_error("hipGetDeviceCount failed."); + if(dev_count == 0) + return ""; + auto dev_prop = get_curr_device_prop(); std::string arch_name_full = dev_prop.gcnArchName; From 0980785daabbf8e0bd9560ea1628490f721593a2 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Wed, 10 Dec 2025 15:33:11 -0700 Subject: [PATCH 16/19] Revert " - No need to put device arch name in StockhamGeneratorSpecs for aot and solution map kernel builds." This reverts commit ea12d45fe867efc6e895c07eb27ba402618941cd. --- projects/rocfft/library/src/rocfft_aot_helper.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/rocfft/library/src/rocfft_aot_helper.cpp b/projects/rocfft/library/src/rocfft_aot_helper.cpp index 2523b19dc55..300baebc16c 100644 --- a/projects/rocfft/library/src/rocfft_aot_helper.cpp +++ b/projects/rocfft/library/src/rocfft_aot_helper.cpp @@ -246,7 +246,7 @@ void build_stockham_function_pool(CompileQueue& queue) StockhamGeneratorSpecs specs{factors, {}, static_cast(precision), - "", + get_curr_gcn_arch_name(), static_cast(i.second.workgroup_size), PrintScheme(scheme)}; specs.threads_per_transform = i.second.threads_per_transform[0]; @@ -665,7 +665,7 @@ void build_solution_kernels(CompileQueue& queue) StockhamGeneratorSpecs specs{factors, {}, static_cast(precision), - "", + get_curr_gcn_arch_name(), static_cast(config.workgroup_size), PrintScheme(scheme)}; specs.threads_per_transform = config.threads_per_transform[0]; From 21e66aecf7044cf97aafad5908c00d8732e831f7 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Wed, 10 Dec 2025 15:33:21 -0700 Subject: [PATCH 17/19] Revert " - Remove no longer needed calls to get_curr_gcn_arch_name to construct StockhamGeneratorSpecs." This reverts commit 84672356ef3d8064b4547f02beb616257b182e7e. --- projects/rocfft/library/src/rocfft_kernel_config_search.cpp | 2 +- projects/rocfft/library/src/rtc_stockham_kernel.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp index 32fed2b46fa..0a821d932d6 100644 --- a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp +++ b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp @@ -160,7 +160,7 @@ std::string test_kernel_src(const std::string& kernel_name, StockhamGeneratorSpecs specs{factorization, {}, static_cast(rocfft_precision_single), - "", + get_curr_gcn_arch_name(), wgs, PrintScheme(compute_scheme)}; diff --git a/projects/rocfft/library/src/rtc_stockham_kernel.cpp b/projects/rocfft/library/src/rtc_stockham_kernel.cpp index a2983785028..ad4e5be59f6 100644 --- a/projects/rocfft/library/src/rtc_stockham_kernel.cpp +++ b/projects/rocfft/library/src/rtc_stockham_kernel.cpp @@ -79,7 +79,7 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs.emplace(factors, std::vector(), precision, - "", + get_curr_gcn_arch_name(), static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs->threads_per_transform = kernel->threads_per_transform[0]; @@ -127,7 +127,7 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs.emplace(factors1d, factors2d, precision, - "", + get_curr_gcn_arch_name(), static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs->threads_per_transform = kernel->threads_per_transform[0]; @@ -137,7 +137,7 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs2d.emplace(factors2d, factors1d, precision, - "", + get_curr_gcn_arch_name(), static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs2d->threads_per_transform = kernel->threads_per_transform[1]; From f1e5984e9556835b374326e95cae2ffd96c9d883 Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Wed, 10 Dec 2025 15:33:25 -0700 Subject: [PATCH 18/19] Revert " - Remove header include." This reverts commit a429b2bc58b72d028008322a03682cfa00d984d1. --- projects/rocfft/library/src/rocfft_kernel_config_search.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp index 0a821d932d6..5fa68d3ef78 100644 --- a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp +++ b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp @@ -31,6 +31,7 @@ #include "../../shared/CLI11.hpp" #include "../../shared/arithmetic.h" +#include "../../shared/device_properties.h" #include "../../shared/gpubuf.h" #include "../../shared/hip_object_wrapper.h" #include "device/generator/stockham_gen.h" From 36a30b72f6aebcadca8fd44e5630b4bb364e5ecc Mon Sep 17 00:00:00 2001 From: Flavio Teixeira Date: Wed, 10 Dec 2025 15:49:36 -0700 Subject: [PATCH 19/19] - Take hip return into account. --- projects/rocfft/shared/device_properties.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/projects/rocfft/shared/device_properties.h b/projects/rocfft/shared/device_properties.h index e8095ef24e2..4d4c001a55a 100644 --- a/projects/rocfft/shared/device_properties.h +++ b/projects/rocfft/shared/device_properties.h @@ -49,10 +49,9 @@ static hipDeviceProp_t get_curr_device_prop() // If no devices are found, return empty string static std::string get_curr_gcn_arch_name() { - int dev_count = 0; - if(hipGetDeviceCount(&dev_count) != hipSuccess) - throw std::runtime_error("hipGetDeviceCount failed."); - if(dev_count == 0) + int dev_count = 0; + auto ret = hipGetDeviceCount(&dev_count); + if(ret != hipSuccess || dev_count == 0) return ""; auto dev_prop = get_curr_device_prop();