diff --git a/projects/rocfft/CHANGELOG.md b/projects/rocfft/CHANGELOG.md index 7d3a5e828b3..dc93e716068 100644 --- a/projects/rocfft/CHANGELOG.md +++ b/projects/rocfft/CHANGELOG.md @@ -5,6 +5,10 @@ Documentation for rocFFT is available at ## rocFFT 1.0.36 (unreleased) +### Added + +* Added support for per precision and architecture kernel configuration entries in the library. + ### Optimized * Removed a potential unnecessary global transpose operation from MPI 3D multi-GPU pencil decompositions. diff --git a/projects/rocfft/library/src/CMakeLists.txt b/projects/rocfft/library/src/CMakeLists.txt index ce6ae5ec794..8ad2dddc063 100644 --- a/projects/rocfft/library/src/CMakeLists.txt +++ b/projects/rocfft/library/src/CMakeLists.txt @@ -151,7 +151,7 @@ set( kgen_logic_files # python code that decides kernel parameters ${CMAKE_SOURCE_DIR}/library/src/device/kernel-generator.py ${CMAKE_SOURCE_DIR}/library/src/device/generator.py - ${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_lds.py + ${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_arch.py ${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_sbrr.py ${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_sbcc.py ${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_sbcr.py diff --git a/projects/rocfft/library/src/device/generator/stockham_gen.cpp b/projects/rocfft/library/src/device/generator/stockham_gen.cpp index e7f921cd42f..8ca2d802975 100644 --- a/projects/rocfft/library/src/device/generator/stockham_gen.cpp +++ b/projects/rocfft/library/src/device/generator/stockham_gen.cpp @@ -22,6 +22,7 @@ using namespace std::placeholders; #include "../../../../shared/arithmetic.h" +#include "../../../../shared/device_properties.h" #include "../../../../shared/precision_type.h" #include "generator.h" #include "stockham_gen.h" @@ -52,7 +53,8 @@ struct GeneratedLauncher const std::vector& pp_factors_other, const unsigned int& pp_current_dim, const unsigned int& pp_off_dim, - bool double_precision, + const unsigned int& precision_type, + const std::string& gcn_arch_name, const std::string& sbrc_type, const std::string& sbrc_transpose_type) : scheme(scheme) @@ -69,7 +71,9 @@ struct GeneratedLauncher , direct_to_from_reg(kernel.direct_to_from_reg) , sbrc_type(sbrc_type) , sbrc_transpose_type(sbrc_transpose_type) - , double_precision(double_precision) + , precision_type(precision_type) + , gcn_arch_name(gcn_arch_name) + { } @@ -90,7 +94,10 @@ struct GeneratedLauncher // SBRC transpose type std::string sbrc_type; std::string sbrc_transpose_type; - bool double_precision; + + unsigned int precision_type; + + std::string gcn_arch_name; // output a json object that the python generator can parse to know // how to build the function pool @@ -129,7 +136,8 @@ struct GeneratedLauncher add_member("direct_to_from_reg", direct_to_from_reg ? "true" : "false"); add_member("sbrc_type", quote_str(sbrc_type)); add_member("sbrc_transpose_type", quote_str(sbrc_transpose_type)); - add_member("double_precision", double_precision ? "true" : "false"); + add_member("precision_type", std::to_string(precision_type)); + add_member("gcn_arch_name", quote_str(gcn_arch_name)); add_member("pp_child_scheme", quote_str(pp_child_scheme)); add_member("pp_factors_curr", vec_to_list(pp_factors_curr)); add_member("pp_factors_other", vec_to_list(pp_factors_other)); @@ -149,9 +157,10 @@ struct LaunchSuffix std::string sbrc_transpose_type; }; -void make_launcher(const std::vector& precision_types, +void make_launcher(const unsigned int& precision_type, const std::vector& launcher_suffixes, StockhamKernel& kernel, + const std::string& gcn_arch_name, const std::string& pp_child_scheme, const std::vector& pp_factors_curr, const std::vector& pp_factors_other, @@ -159,21 +168,20 @@ void make_launcher(const std::vector& precision_types, const unsigned int& pp_off_dim, std::vector& generated_launchers) { - for(auto precision_type : precision_types) + + for(auto&& launcher : launcher_suffixes) { - for(auto&& launcher : launcher_suffixes) - { - generated_launchers.emplace_back(kernel, - launcher.scheme, - pp_child_scheme, - pp_factors_curr, - pp_factors_other, - pp_current_dim, - pp_off_dim, - precision_type == rocfft_precision_double, - launcher.sbrc_type, - launcher.sbrc_transpose_type); - } + generated_launchers.emplace_back(kernel, + launcher.scheme, + pp_child_scheme, + pp_factors_curr, + pp_factors_other, + pp_current_dim, + pp_off_dim, + precision_type, + gcn_arch_name, + launcher.sbrc_type, + launcher.sbrc_transpose_type); } } @@ -255,9 +263,10 @@ void stockham_partial_pass_variants(const std::string& kernel_name if(params_1.current_dim == 0 && params_2.current_dim == 2) { StockhamPartialPassKernelRR kernelRR(specs1, params_1); - make_launcher(specs1.precisions, + make_launcher(specs1.precision, {{"pp_stoc", specs1.scheme, "", ""}}, kernelRR, + specs1.gcn_arch_name, "CS_KERNEL_STOCKHAM_PP", params_1.pp_factors_curr, params_1.pp_factors_other, @@ -266,9 +275,10 @@ void stockham_partial_pass_variants(const std::string& kernel_name launchers); StockhamPartialPassKernelCC kernelCC(specs2, params_2, false); - make_launcher(specs2.precisions, + make_launcher(specs2.precision, {{"pp_sbcc", specs2.scheme, "", ""}}, kernelCC, + specs2.gcn_arch_name, "CS_KERNEL_STOCKHAM_PP_BLOCK_CC", params_2.pp_factors_curr, params_2.pp_factors_other, @@ -279,9 +289,10 @@ void stockham_partial_pass_variants(const std::string& kernel_name else if(params_1.current_dim == 2 && params_2.current_dim == 0) { StockhamPartialPassKernelRR kernelCC(specs1, params_1); - make_launcher(specs1.precisions, + make_launcher(specs1.precision, {{"pp_sbcc", specs1.scheme, "", ""}}, kernelCC, + specs1.gcn_arch_name, "CS_KERNEL_STOCKHAM_PP_BLOCK_CC", params_1.pp_factors_curr, params_1.pp_factors_other, @@ -290,9 +301,10 @@ void stockham_partial_pass_variants(const std::string& kernel_name launchers); StockhamPartialPassKernelCC kernelRR(specs2, params_2, false); - make_launcher(specs2.precisions, + make_launcher(specs2.precision, {{"pp_stoc", specs2.scheme, "", ""}}, kernelRR, + specs2.gcn_arch_name, "CS_KERNEL_STOCKHAM_PP", params_2.pp_factors_curr, params_2.pp_factors_other, @@ -341,9 +353,10 @@ void stockham_variants(const std::string& kernel_name, if(specs.scheme == "CS_KERNEL_STOCKHAM") { StockhamKernelRR kernel(specs); - make_launcher(specs.precisions, + make_launcher(specs.precision, {{"stoc", specs.scheme, "", ""}}, kernel, + specs.gcn_arch_name, "CS_NONE", std::vector(), std::vector(), @@ -354,9 +367,10 @@ void stockham_variants(const std::string& kernel_name, else if(specs.scheme == "CS_KERNEL_STOCKHAM_BLOCK_CC") { StockhamKernelCC kernel(specs, false, false); - make_launcher(specs.precisions, + make_launcher(specs.precision, {{"sbcc", specs.scheme, "", ""}}, kernel, + specs.gcn_arch_name, "CS_NONE", std::vector(), std::vector(), @@ -405,9 +419,10 @@ void stockham_variants(const std::string& kernel_name, "SBRC_3D_FFT_ERC_TRANS_Z_XY", "TILE_UNALIGNED"}); - make_launcher(specs.precisions, + make_launcher(specs.precision, suffixes, kernel, + specs.gcn_arch_name, "CS_NONE", std::vector(), std::vector(), @@ -419,9 +434,10 @@ void stockham_variants(const std::string& kernel_name, { StockhamKernelCR kernel(specs); - make_launcher(specs.precisions, + make_launcher(specs.precision, {{"sbcr", specs.scheme, "", ""}}, kernel, + specs.gcn_arch_name, "CS_NONE", std::vector(), std::vector(), @@ -433,20 +449,17 @@ void stockham_variants(const std::string& kernel_name, { StockhamKernelFused2D fused2d(specs, specs2d); - // output 2D launchers - for(auto prec_type : specs.precisions) - { - launchers.emplace_back(fused2d, - specs.scheme, - "CS_NONE", - std::vector(), - std::vector(), - 0, - 0, - (prec_type == rocfft_precision_double), - "", - ""); - } + launchers.emplace_back(fused2d, + specs.scheme, + "CS_NONE", + std::vector(), + std::vector(), + 0, + 0, + specs.precision, + specs.gcn_arch_name, + "", + ""); } else throw std::runtime_error("unhandled scheme"); @@ -454,18 +467,6 @@ void stockham_variants(const std::string& kernel_name, output_json(launchers, kernel_name, output); } -static size_t max_bytes_per_element(const std::vector& precisions) -{ - // generate for the maximum element size in the available - // precisions - size_t element_size = 0; - for(auto p : precisions) - { - element_size = std::max(element_size, complex_type_size(static_cast(p))); - } - return element_size; -} - // ========================================================= // Partial pass parameters row-major ordering helpers. // Kernel configuration parameters for CS_3D_PP in @@ -636,9 +637,11 @@ int main() std::string line; std::string kernel_name; + std::string gcn_arch_name; std::string scheme; bool half_lds; unsigned int lds_size_bytes; + unsigned int bytes_per_element; const char* DELIM = ""; std::cout << "{"; @@ -661,6 +664,9 @@ int main() lds_size_bytes = std::stoul(*arg); ++arg; + bytes_per_element = std::stoul(*arg); + ++arg; + std::string kernel_name = *arg; ++arg; @@ -698,8 +704,11 @@ int main() threads_per_transform = parse_uints_csv(*arg); ++arg; - std::vector precisions; - precisions = parse_uints_csv(*arg); + gcn_arch_name = *arg; + + ++arg; + unsigned int precision; + precision = std::stoul(*arg); // create spec and pass to stockham_variants, writes partial output to stdout std::cout << DELIM; @@ -733,12 +742,14 @@ int main() if(direct_to_from_reg.size() != 2) throw std::runtime_error("CS_3D_PP requires two direct_to_from_reg configuration"); - StockhamGeneratorSpecs specs1(factors1, {}, precisions, workgroup_size[0], scheme); + StockhamGeneratorSpecs specs1( + factors1, {}, precision, gcn_arch_name, workgroup_size[0], scheme); specs1.direct_to_from_reg = direct_to_from_reg[0]; specs1.threads_per_transform = threads_per_transform[0]; specs1.wgs_is_derived = true; - StockhamGeneratorSpecs specs2(factors2, {}, precisions, workgroup_size[1], scheme); + StockhamGeneratorSpecs specs2( + factors2, {}, precision, gcn_arch_name, workgroup_size[1], scheme); specs2.direct_to_from_reg = direct_to_from_reg[1]; specs2.threads_per_transform = threads_per_transform[1]; specs2.wgs_is_derived = true; @@ -769,24 +780,28 @@ int main() ++arg; factors = parse_uints_csv(*arg); - StockhamGeneratorSpecs specs(factors, factors2d, precisions, workgroup_size[0], scheme); + StockhamGeneratorSpecs specs( + factors, factors2d, precision, gcn_arch_name, workgroup_size[0], scheme); specs.half_lds = half_lds; specs.direct_to_from_reg = direct_to_from_reg[0]; - specs.bytes_per_element = max_bytes_per_element(precisions); + specs.bytes_per_element = bytes_per_element; specs.threads_per_transform = threads_per_transform.front(); // second dimension for 2D_SINGLE StockhamGeneratorSpecs specs2d( - factors2d, factors, precisions, workgroup_size[0], scheme); + factors2d, factors, precision, gcn_arch_name, workgroup_size[0], scheme); if(!threads_per_transform.empty()) specs2d.threads_per_transform = threads_per_transform.back(); // 2D_SINGLE kernels use the specified workgroup size - // directly - if(scheme == "CS_KERNEL_2D_SINGLE") + // directly. + // Kernels with an architecture other than generic will + // also use the workgroup size directly, as the calculations + // for wgs_is_derived=false assume an LDS size of 64KiB. + if(scheme == "CS_KERNEL_2D_SINGLE" || gcn_arch_name != generic_gcn_arch_name) { specs.wgs_is_derived = true; specs2d.wgs_is_derived = true; diff --git a/projects/rocfft/library/src/device/generator/stockham_gen.h b/projects/rocfft/library/src/device/generator/stockham_gen.h index b71cb57e3e9..7fe259b5161 100644 --- a/projects/rocfft/library/src/device/generator/stockham_gen.h +++ b/projects/rocfft/library/src/device/generator/stockham_gen.h @@ -32,12 +32,14 @@ struct StockhamGeneratorSpecs { StockhamGeneratorSpecs(const std::vector& factors, const std::vector& factors2d, - const std::vector& precisions, + unsigned int precision, + const std::string& gcn_arch_name, unsigned int workgroup_size, const std::string& scheme) : factors(factors) , factors2d(factors2d) - , precisions(precisions) + , precision(precision) + , gcn_arch_name(gcn_arch_name) , length(product(factors.begin(), factors.end())) , length2d(product(factors2d.begin(), factors2d.end())) , workgroup_size(workgroup_size) @@ -47,7 +49,8 @@ struct StockhamGeneratorSpecs std::vector factors; std::vector factors2d; - std::vector precisions; // mapped from rocfft_precision + unsigned int precision; // mapped from rocfft_precision + std::string gcn_arch_name; unsigned int length; unsigned int length2d = 0; diff --git a/projects/rocfft/library/src/device/kernel-generator.py b/projects/rocfft/library/src/device/kernel-generator.py index b2b943b0106..dc88206799d 100644 --- a/projects/rocfft/library/src/device/kernel-generator.py +++ b/projects/rocfft/library/src/device/kernel-generator.py @@ -32,6 +32,7 @@ import sys import json import threading +import copy import kernels.configs.config_sbrc as config_sbrc import kernels.configs.config_sbrr as config_sbrr @@ -39,8 +40,8 @@ import kernels.configs.config_sbcr as config_sbcr import kernels.configs.config_2d_single as config_2d_single import kernels.configs.config_pp_3d as config_pp_3d +import kernels.configs.config_arch as config_arch -from copy import deepcopy from pathlib import Path from types import SimpleNamespace as NS from operator import mul @@ -85,20 +86,127 @@ def cjoin(xs): # # Helpers # -def unique(kernels): - """Merge kernel lists without duplicated meta.length; ignore later ones.""" +def get_kernel_key(kernel): + """Return a key tuple for a kernel based on (length, scheme, lds_size_bytes, gcn_arch_name).""" + if isinstance(kernel.length, list): + length_key = tuple(kernel.length) + else: + length_key = kernel.length + + key = (length_key, kernel.scheme, kernel.lds_size_bytes, + kernel.gcn_arch_name) + + return key + + +def merge_kernel_list(kernels, all_precisions): + """Merge precision and architecture lists with kernel list. + Check for duplicated kernel and invalid precision/arch entries.""" + r, s = list(), set() + + all_archs = [member.value for member in config_arch.supported_arch] + all_lds_configs = [member.value for member in config_arch.lds_config] + + lds_size_err_msg = "Error: invalid lds_size_bytes in kernel configuration: \n" + arch_err_msg = "Error: invalid architecture in kernel configuration: \n" + prec_err_msg = "Error: invalid precision in kernel configuration: \n" + dup_err_msg = "Error: duplicated entry in kernel configuration: \n" + for kernel in kernels: - if isinstance(kernel.length, list): - key = tuple(kernel.length) + (kernel.scheme, kernel.lds_size_bytes) + if hasattr(kernel, 'precision'): + if not isinstance(kernel.precision, list): + kernel.precision = [kernel.precision] + + precisions = kernel.precision + else: + precisions = all_precisions + + if hasattr(kernel, 'gcn_arch_name'): + if not isinstance(kernel.gcn_arch_name, list): + kernel.gcn_arch_name = [kernel.gcn_arch_name] + archs = kernel.gcn_arch_name + if hasattr(kernel, 'lds_size_bytes'): + # lds size not allowed to be specified per arch + print(lds_size_err_msg + str(kernel)) + sys.exit(1) + # lds size will be determined at runtime based on arch + kernel.lds_size_bytes = 0 else: - key = (kernel.length, kernel.scheme, kernel.lds_size_bytes) - if key not in s: - s.add(key) - r.append(kernel) + # default to gfx generic if no arch specified + archs = [config_arch.supported_arch.GFX_GENERIC.value] + if hasattr(kernel, 'lds_size_bytes'): + if isinstance(kernel.lds_size_bytes, list): + # only one lds size allowed if gfx generic + print(lds_size_err_msg + str(kernel)) + sys.exit(1) + + if kernel.lds_size_bytes not in all_lds_configs: + print(lds_size_err_msg + str(kernel)) + sys.exit(1) + else: + # default lds size to 64KiB + kernel.lds_size_bytes = config_arch.lds_config.SIZE_64KiB.value + + for a in archs: + if a not in all_archs: + print(arch_err_msg + str(kernel)) + sys.exit(1) + for p in precisions: + if p not in all_precisions: + print(prec_err_msg + str(kernel)) + sys.exit(1) + + kernel_cpy = copy.copy(kernel) + kernel_cpy.precision = p + kernel_cpy.gcn_arch_name = a + + key = (get_kernel_key(kernel_cpy), kernel_cpy.precision) + + if key not in s: + s.add(key) + r.append(kernel_cpy) + else: + print(dup_err_msg + str(kernel)) + sys.exit(1) return r +def set_bytes_per_element(kernels): + """Default behavior for bytes_per_element calculation is to use the highest precision + for a given kernel key (length, scheme, lds_size_bytes). bytes_per_element is used to + calculate transforms_per_block and workgroup_size later on. This behavior for tpb and + wgs calculation can be overridden by using wgs_is_derived = true in kernel config. + NOTE: Once all kernels are tuned by precision, this function can most likely go away. + """ + + d = dict() + + # For a given kernel key (length, scheme, lds_size_bytes, gcn_arch_name), + # we can have double-, single-, and half-precision entries, but those entries + # can be completely out of order in kernels list. We need the first loop to know, + # for a given key, what precision entries are available. The second loop assigns + # the maximum bytes per element, based on the available precision entries for a + # given key. + for kernel in kernels: + key = get_kernel_key(kernel) + if key not in d: + d[key] = list() + d[key].append(kernel.precision) + + for kernel in kernels: + key = get_kernel_key(kernel) + precisions = d[key] + if 'dp' in precisions: + kernel.bytes_per_element = 16 + elif 'sp' in precisions: + kernel.bytes_per_element = 8 + elif 'hp' in precisions: + kernel.bytes_per_element = 4 + + return kernels + + def is_aot_rtc(meta): return not meta.runtime_compile @@ -116,8 +224,8 @@ def __str__(self): f = 'FFTKernel(' use_3steps_large_twd = getattr(self.function.meta, 'use_3steps_large_twd', None) - # assume half-precision needs the same thing as single - precision = 'sp' if self.function.meta.precision == 'half' else self.function.meta.precision + # assume half-precision needs the same large twiddle table as single + precision = 'sp' if self.function.meta.precision == 'hp' else self.function.meta.precision if use_3steps_large_twd is not None: f += str(use_3steps_large_twd[precision]) else: @@ -191,7 +299,7 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): precisions = { 'sp': 'rocfft_precision_single', 'dp': 'rocfft_precision_double', - 'half': 'rocfft_precision_half', + 'hp': 'rocfft_precision_half', } var_kernel = Variable('kernel', 'FFTKernel') var_pp_kernel_1 = Variable('pp_kernel_1', 'FFTKernel') @@ -208,16 +316,17 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): curr_func, curr_file = 0, 0 while curr_func < len(functions): f = functions[curr_func] - length, precision, scheme, transpose = f.meta.length, f.meta.precision, f.meta.scheme, f.meta.transpose + length, precision, arch, scheme, transpose = f.meta.length, f.meta.precision, f.meta.gcn_arch_name, f.meta.scheme, f.meta.transpose if isinstance(length, (int, str)): length = [length, 0] piece_contents[curr_file] += Assign(var_kernel, FFTKernel(f)) - key = Call( - name='FMKey', - arguments=ArgumentList(length[0], length[1], precisions[precision], - scheme, transpose or 'NONE', - 'kernel.get_kernel_config()')).inline() + key = Call(name='FMKey', + arguments=ArgumentList(length[0], length[1], + precisions[precision], scheme, + transpose or 'NONE', + 'kernel.get_kernel_config()', + ''.join(['"', arch, '"']))).inline() piece_contents[curr_file] += function_map.insert( key, var_kernel, 'std::get<0>(def_keys)', 'std::get<0>(function_maps)', f.meta.lds_size_bytes) @@ -236,7 +345,7 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): # get first pp kernel f_pp_1 = pp_functions[counter_f_pp_1] - # PPFMKey entry needs two kernels with same length and precision, but different pp_current_dim + # PPFMKey entry needs two kernels with same length, precision, and arch, but different pp_current_dim counter_f_pp_2 = counter_f_pp_1 + 1 if counter_f_pp_2 >= len(pp_functions): break @@ -244,12 +353,16 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): while counter_f_pp_2 < len(pp_functions): f_pp_2 = pp_functions[counter_f_pp_2] if (f_pp_1.meta.length == f_pp_2.meta.length - and f_pp_1.meta.precision == f_pp_2.meta.precision + and f_pp_1.meta.precision == f_pp_2.meta.precision and + f_pp_1.meta.gcn_arch_name == f_pp_2.meta.gcn_arch_name and f_pp_1.meta.pp_current_dim != f_pp_2.meta.pp_current_dim): break - if (f_pp_1.meta.length != f_pp_2.meta.length): - # we hit a new kernel with different length + if (f_pp_1.meta.length != f_pp_2.meta.length or ( + f_pp_1.meta.length == f_pp_2.meta.length and + (f_pp_1.meta.precision != f_pp_2.meta.precision or + f_pp_1.meta.gcn_arch_name != f_pp_2.meta.gcn_arch_name))): + # we hit a new kernel with different length/precision/arch # start next iteration looking for the next pair counter_f_pp_1 = counter_f_pp_2 skip_to_next_iter = True @@ -272,12 +385,14 @@ def generate_cpu_function_pool_pieces(functions, pp_functions, num_files): length = f_pp_1.meta.length precision = f_pp_1.meta.precision scheme = f_pp_1.meta.scheme + arch_name = f_pp_1.meta.gcn_arch_name key = Call(name='PPFMKey', arguments=ArgumentList( length[0], length[1], length[2], precisions[precision], scheme, 'pp_kernel_1.get_kernel_config()', - 'pp_kernel_2.get_kernel_config()')).inline() + 'pp_kernel_2.get_kernel_config()', + ''.join(['"', arch_name, '"']))).inline() piece_contents[curr_file] += function_map.insert_pp( key, var_pp_kernel_1, var_pp_kernel_2, 'std::get<1>(def_keys)', 'std::get<1>(function_maps)', f_pp_1.meta.lds_size_bytes) @@ -317,9 +432,13 @@ def kernel_name(ns): elif ns.scheme == 'CS_KERNEL_STOCKHAM_BLOCK_CR': postfix = '_sbcr' + postfix += f'_{ns.precision}' + if hasattr(ns, 'lds_size_bytes'): postfix += f'_lds{ns.lds_size_bytes}' + postfix += f'_{ns.gcn_arch_name}' + return f'rocfft_len{length}{postfix}' @@ -328,10 +447,8 @@ def list_small_kernels(): kernels1d = config_sbrr.sbrr_kernels kernels = [ - NS(**kernel.__dict__, - scheme='CS_KERNEL_STOCKHAM', - precision=['sp', 'dp'] if not hasattr(kernel, 'double_precision') - or kernel.double_precision else ['sp']) for kernel in kernels1d + NS(**kernel.__dict__, scheme='CS_KERNEL_STOCKHAM') + for kernel in kernels1d ] return kernels @@ -381,10 +498,7 @@ def list_2d_kernels(): expanded.extend( NS(**kernel.__dict__, scheme='CS_KERNEL_2D_SINGLE', - runtime_compile=True, - precision=['sp', 'dp'] if not hasattr(kernel, 'double_precision') - or kernel.double_precision else ['sp']) - for kernel in fused_2d_kernels) + runtime_compile=True) for kernel in fused_2d_kernels) return expanded @@ -412,7 +526,7 @@ def default_runtime_compile(kernels, default_val): ] -def generate_kernel_functions(kernels, precisions, launchers_json): +def generate_kernel_functions(precisions_type_dict, kernels, launchers_json): """Generate CPU functions used to populate function pool with each kernel in `kernels`, and its variations. """ @@ -449,7 +563,8 @@ def generate_kernel_functions(kernels, precisions, launchers_json): pp_current_dim = launcher.pp_current_dim pp_off_dim = launcher.pp_off_dim sbrc_transpose_type = launcher.sbrc_transpose_type - precision = 'dp' if launcher.double_precision else 'sp' + precision = precisions_type_dict[launcher.precision_type] + gcn_arch_name = launcher.gcn_arch_name runtime_compile = kernel.runtime_compile use_3steps_large_twd = getattr(kernel, 'use_3steps_large_twd', None) @@ -463,33 +578,30 @@ def generate_kernel_functions(kernels, precisions, launchers_json): threads_per_transform, 0 ] - precisions = [precision] - if precision == 'sp': - precisions.append('half') - for p in precisions: - f = Function(arguments=ArgumentList(data, back), - meta=NS(factors=factors, - length=length, - params=params, - precision=p, - runtime_compile=runtime_compile, - scheme=scheme, - workgroup_size=workgroup_size, - transforms_per_block=transforms_per_block, - threads_per_transform=tpt_list, - transpose=sbrc_transpose_type, - use_3steps_large_twd=use_3steps_large_twd, - lds_size_bytes=kernel.lds_size_bytes, - pp_child_scheme=pp_child_scheme, - pp_factors_curr=pp_factors_curr, - pp_factors_other=pp_factors_other, - pp_current_dim=pp_current_dim, - pp_off_dim=pp_off_dim)) - - if (scheme == 'CS_3D_PP'): - pp_kernel_functions.append(f) - else: - kernel_functions.append(f) + f = Function(arguments=ArgumentList(data, back), + meta=NS(factors=factors, + length=length, + params=params, + precision=precision, + gcn_arch_name=gcn_arch_name, + runtime_compile=runtime_compile, + scheme=scheme, + workgroup_size=workgroup_size, + transforms_per_block=transforms_per_block, + threads_per_transform=tpt_list, + transpose=sbrc_transpose_type, + use_3steps_large_twd=use_3steps_large_twd, + lds_size_bytes=kernel.lds_size_bytes, + pp_child_scheme=pp_child_scheme, + pp_factors_curr=pp_factors_curr, + pp_factors_other=pp_factors_other, + pp_current_dim=pp_current_dim, + pp_off_dim=pp_off_dim)) + + if (scheme == 'CS_3D_PP'): + pp_kernel_functions.append(f) + else: + kernel_functions.append(f) return kernel_functions, pp_kernel_functions @@ -502,7 +614,7 @@ def read_subprocess(proc_output, output): output[0] = json_string -def generate_kernels(kernels, precisions, stockham_gen): +def generate_kernels(precisions_dict, kernels, stockham_gen): """Generate and write kernels from the kernel list. Entries in the kernel list are simple namespaces. These are @@ -511,8 +623,6 @@ def generate_kernels(kernels, precisions, stockham_gen): A list of CPU functions is returned. """ - pre_enum = {'sp': 0, 'dp': 1} - # run stockham_gen to retrieve JSON output via stdout, used for additional kernel details proc = subprocess.Popen(args=[stockham_gen], stdin=subprocess.PIPE, @@ -534,24 +644,25 @@ def generate_kernels(kernels, precisions, stockham_gen): if kernel_idx < num_kernels: k = kernels[kernel_idx] - kernel_precisions = k.precision if hasattr( - k, 'precision') else precisions - # 2D single kernels always specify threads per transform if isinstance(k.length, list): proc.stdin.write(','.join([str(f) for f in k.factors[0]]) + " ") - proc.stdin.write(','.join([str(f) - for f in k.factors[1]]) + " ") - proc.stdin.write( - ','.join([str(pre_enum[pre]) - for pre in kernel_precisions]) + " ") + proc.stdin.write(','.join([str(f) for f in k.factors[1]])) + + proc.stdin.write(f' {str(precisions_dict[k.precision])}') + + proc.stdin.write(f' {k.gcn_arch_name}' + " ") + proc.stdin.write(','.join( [str(f) for f in k.threads_per_transform])) else: - proc.stdin.write(','.join([str(f) for f in k.factors]) + " ") - proc.stdin.write(','.join( - [str(pre_enum[pre]) for pre in kernel_precisions])) + proc.stdin.write(','.join([str(f) for f in k.factors])) + + proc.stdin.write(f' {precisions_dict[k.precision]}') + + proc.stdin.write(f' {k.gcn_arch_name}') + # 1D kernels might not, and need to default to 'uwide' threads_per_transform = getattr( k, 'threads_per_transform', { @@ -601,6 +712,7 @@ def generate_kernels(kernels, precisions, stockham_gen): proc.stdin.write(f' {k.scheme}') proc.stdin.write(f' {kernel_name(k)}') + proc.stdin.write(f' {k.bytes_per_element}') proc.stdin.write(f' {k.lds_size_bytes}') proc.stdin.write('\n') @@ -620,7 +732,11 @@ def generate_kernels(kernels, precisions, stockham_gen): json_string = json_result[0] kernel_launchers = json.loads(json_string) - return generate_kernel_functions(kernels, precisions, kernel_launchers) + # Invert precisions dict for easy lookup + precisions_type_dict = {v: k for k, v in precisions_dict.items()} + + return generate_kernel_functions(precisions_type_dict, kernels, + kernel_launchers) def cli(): @@ -635,9 +751,6 @@ def cli(): type=int, help='Number of files to generate for parallel compilation.') - list_parser = subparsers.add_parser( - 'list', help='List names of kernels that will be generated.') - generate_parser = subparsers.add_parser('generate', help='Generate kernels.') generate_parser.add_argument('stockham_gen', @@ -649,7 +762,11 @@ def cli(): assert (args.num_files > 0), 'Number of files for function_pool should be positive' - precisions = ['dp', 'sp'] + # List of supported precisions to build + # sp: single-precision + # dp: double-precision + # hp: half-precision + precisions_dict = {'sp': 0, 'dp': 1, 'hp': 2} # # kernel list @@ -663,17 +780,19 @@ def cli(): kernels += all_kernels + kernels_2d + kernel_3d_pp - # set default lds size (64k) on kernels if not specified - for k in kernels: - if not hasattr(k, 'lds_size_bytes'): - k.lds_size_bytes = 65536 + # + # merge kernel list with additional precision/arch entries if required + # + kernels = merge_kernel_list(kernels, list(precisions_dict)) - kernels = unique(kernels) + # + # set bytes_per_element based on highest precision for a given kernel key + # + kernels = set_bytes_per_element(kernels) # # set runtime compile # - kernels = default_runtime_compile(kernels, args.runtime_compile_default == 'ON') @@ -682,7 +801,7 @@ def cli(): # if args.command == 'generate': - functions, pp_functions = generate_kernels(kernels, precisions, + functions, pp_functions = generate_kernels(precisions_dict, kernels, args.stockham_gen) func_files = generate_cpu_function_pool_pieces(functions, pp_functions, args.num_files) diff --git a/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py b/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py index 181f63fb9aa..963379b364e 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_2d_single.py @@ -18,7 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # yapf: disable @@ -161,20 +162,20 @@ NS(length=[96,42], factors=[[6,16],[7,6]], threads_per_transform=[6,6], workgroup_size=576), # configs for 160KiB LDS - NS(length=[40,108], factors=[[8,5],[6,3,6]], threads_per_transform=[8,12], workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), - NS(length=[108,40], factors=[[6,3,6],[8,5]], threads_per_transform=[12,8], workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), - NS(length=[48,100], factors=[[8,6],[10,10]], threads_per_transform=[8,10], workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), - NS(length=[100,48], factors=[[10,10],[8,6]], threads_per_transform=[10,8], workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), - NS(length=[48,48], factors=[[8,6],[8,6]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=config_lds.LDS_160k), - NS(length=[48,96], factors=[[8,6],[4,6,4]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=config_lds.LDS_160k), - NS(length=[96,48], factors=[[4,6,4],[8,6]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=config_lds.LDS_160k), - NS(length=[50,100], factors=[[10,5],[10,10]], threads_per_transform=[10,10], workgroup_size=250, lds_size_bytes=config_lds.LDS_160k), - NS(length=[100,50], factors=[[10,10],[10,5]], threads_per_transform=[10,10], workgroup_size=250, lds_size_bytes=config_lds.LDS_160k), - NS(length=[50,96], factors=[[10,5],[4,6,4]], threads_per_transform=[10,16], double_precision=False, workgroup_size=240, lds_size_bytes=config_lds.LDS_160k), - NS(length=[54,108], factors=[[6,3,3],[6,3,6]], threads_per_transform=[6,12], double_precision=False, workgroup_size=243, lds_size_bytes=config_lds.LDS_160k), - NS(length=[108,54], factors=[[6,3,6],[6,3,3]], threads_per_transform=[12,6], double_precision=False, workgroup_size=243, lds_size_bytes=config_lds.LDS_160k), - NS(length=[64,128], factors=[[4,4,4],[4,8,4]], threads_per_transform=[8,16], double_precision=False, workgroup_size=512, lds_size_bytes=config_lds.LDS_160k), - NS(length=[128,64], factors=[[4,8,4],[4,4,4]], threads_per_transform=[16,8], double_precision=False, workgroup_size=512, lds_size_bytes=config_lds.LDS_160k), - NS(length=[96,96], factors=[[4,6,4],[4,6,4]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=config_lds.LDS_160k), - NS(length=[100,100], factors=[[10,10],[10,10]], threads_per_transform=[10,10], double_precision=False, workgroup_size=500, lds_size_bytes=config_lds.LDS_160k), + NS(length=[40,108], factors=[[8,5],[6,3,6]], threads_per_transform=[8,12], workgroup_size=240, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[108,40], factors=[[6,3,6],[8,5]], threads_per_transform=[12,8], workgroup_size=240, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[48,100], factors=[[8,6],[10,10]], threads_per_transform=[8,10], workgroup_size=240, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[100,48], factors=[[10,10],[8,6]], threads_per_transform=[10,8], workgroup_size=240, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[48,48], factors=[[8,6],[8,6]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[48,96], factors=[[8,6],[4,6,4]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[96,48], factors=[[4,6,4],[8,6]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[50,100], factors=[[10,5],[10,10]], threads_per_transform=[10,10], workgroup_size=250, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[100,50], factors=[[10,10],[10,5]], threads_per_transform=[10,10], workgroup_size=250, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[50,96], factors=[[10,5],[4,6,4]], threads_per_transform=[10,16], precision=['sp', 'hp'], workgroup_size=240, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[54,108], factors=[[6,3,3],[6,3,6]], threads_per_transform=[6,12], precision=['sp', 'hp'], workgroup_size=243, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[108,54], factors=[[6,3,6],[6,3,3]], threads_per_transform=[12,6], precision=['sp', 'hp'], workgroup_size=243, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[64,128], factors=[[4,4,4],[4,8,4]], threads_per_transform=[8,16], precision=['sp', 'hp'], workgroup_size=512, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[128,64], factors=[[4,8,4],[4,4,4]], threads_per_transform=[16,8], precision=['sp', 'hp'], workgroup_size=512, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[96,96], factors=[[4,6,4],[4,6,4]], threads_per_transform=[8,8], workgroup_size=256, lds_size_bytes=lds_config.SIZE_160KiB.value), + NS(length=[100,100], factors=[[10,10],[10,10]], threads_per_transform=[10,10], precision=['sp', 'hp'], workgroup_size=500, lds_size_bytes=lds_config.SIZE_160KiB.value), ] diff --git a/projects/rocfft/library/src/device/kernels/configs/config_lds.py b/projects/rocfft/library/src/device/kernels/configs/config_arch.py similarity index 69% rename from projects/rocfft/library/src/device/kernels/configs/config_lds.py rename to projects/rocfft/library/src/device/kernels/configs/config_arch.py index 52dd1a558af..850c969c073 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_lds.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_arch.py @@ -18,4 +18,27 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -LDS_160k = 160 * 1024 +from enum import Enum + + +class lds_config(Enum): + SIZE_64KiB = 64 * 1024 + SIZE_160KiB = 160 * 1024 + + +class supported_arch(Enum): + GFX_GENERIC = "gfx_generic" + GFX_803 = "gfx803" + GFX_900 = "gfx900" + GFX_906 = "gfx906" + GFX_908 = "gfx908" + GFX_90A = "gfx90a" + GFX_942 = "gfx942" + GFX_950 = "gfx950" + GFX_1030 = "gfx1030" + GFX_1100 = "gfx1100" + GFX_1101 = "gfx1101" + GFX_1102 = "gfx1102" + GFX_1151 = "gfx1151" + GFX_1200 = "gfx1200" + GFX_1201 = "gfx1201" diff --git a/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py b/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py index 7ce240a409e..54d619a33da 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_pp_3d.py @@ -18,7 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # yapf: disable diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py b/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py index 6eeed173374..3720ce793b2 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbcc.py @@ -18,7 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # Note: Default direct_to_from_reg is True diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py b/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py index 625d2a1d217..c926bdb4f67 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbcr.py @@ -18,7 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # NB: diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py b/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py index 38ce5f0f37c..bd53541ddeb 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbrc.py @@ -18,7 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # for SBRC, if direct_to_from_reg is True, we do store-from-reg, but will not do load-to-reg diff --git a/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py b/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py index 2d495a775f9..99521991615 100644 --- a/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py +++ b/projects/rocfft/library/src/device/kernels/configs/config_sbrr.py @@ -18,7 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from kernels.configs import config_lds +from kernels.configs.config_arch import lds_config +from kernels.configs.config_arch import supported_arch from types import SimpleNamespace as NS # Note: Default half_lds is True and default direct_to_from_reg is True as well. @@ -482,27 +483,27 @@ NS(length=4000, workgroup_size=256, threads_per_transform=200, factors=(10, 10, 10, 4), runtime_compile=True), NS(length=4050, workgroup_size=256, threads_per_transform=135, factors=(10, 5, 3, 3, 3, 3), runtime_compile=True), NS(length=4096, workgroup_size=256, threads_per_transform=256, factors=(16, 16, 16), runtime_compile=True), - NS(length=4704, workgroup_size=256, threads_per_transform=224, factors=(8, 4, 7, 7, 3), double_precision=False, runtime_compile=True), - NS(length=5488, workgroup_size=256, threads_per_transform=196, factors=(7, 4, 7, 4, 7), double_precision=False, runtime_compile=True), - NS(length=6144, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 8, 3, 4), double_precision=False, runtime_compile=True), - NS(length=6561, workgroup_size=256, threads_per_transform=243, factors=(3, 3, 3, 3, 3, 3, 3, 3), double_precision=False, runtime_compile=True), - NS(length=8192, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 4, 4, 8), double_precision=False, runtime_compile=True), + NS(length=4704, workgroup_size=256, threads_per_transform=224, factors=(8, 4, 7, 7, 3), precision=['sp', 'hp'], runtime_compile=True), + NS(length=5488, workgroup_size=256, threads_per_transform=196, factors=(7, 4, 7, 4, 7), precision=['sp', 'hp'], runtime_compile=True), + NS(length=6144, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 8, 3, 4), precision=['sp', 'hp'], runtime_compile=True), + NS(length=6561, workgroup_size=256, threads_per_transform=243, factors=(3, 3, 3, 3, 3, 3, 3, 3), precision=['sp', 'hp'], runtime_compile=True), + NS(length=8192, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 4, 4, 8), precision=['sp', 'hp'], runtime_compile=True), # configs for 160KiB LDS - NS(length=4704, workgroup_size=256, threads_per_transform=224, factors=(8, 4, 7, 7, 3), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=5488, workgroup_size=256, threads_per_transform=196, factors=(7, 4, 7, 4, 7), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=6144, workgroup_size=384, threads_per_transform=256, factors=(4, 8, 8, 8, 3), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=6561, workgroup_size=256, threads_per_transform=243, factors=(3, 3, 3, 3, 3, 3, 3, 3), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=8192, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 16, 8), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=9216, workgroup_size=512, threads_per_transform=512, factors=(4, 8, 4, 4, 3, 6), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=10000, workgroup_size=512, threads_per_transform=500, factors=(4, 5, 5, 10, 10), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=10240, workgroup_size=512, threads_per_transform=512, factors=(8, 4, 4, 4, 5, 4), lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=10752, workgroup_size=512, threads_per_transform=512, factors=(4, 16, 8, 7, 3), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=11200, workgroup_size=512, threads_per_transform=448, factors=(4, 7, 5, 16, 5), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=12288, workgroup_size=512, threads_per_transform=512, factors=(8, 8, 4, 6, 8), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=16384, workgroup_size=512, threads_per_transform=512, factors=(8, 16, 4, 8, 4), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=16807, workgroup_size=384, threads_per_transform=343, factors=(7, 7, 7, 7, 7), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=18816, workgroup_size=512, threads_per_transform=448, factors=(8, 8, 7, 7, 6), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=19200, workgroup_size=512, threads_per_transform=480, factors=(8, 10, 8, 5, 6), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), - NS(length=20480, workgroup_size=512, threads_per_transform=512, factors=(4, 4, 16, 10, 8), double_precision=False, lds_size_bytes=config_lds.LDS_160k, runtime_compile=True), + NS(length=4704, workgroup_size=256, threads_per_transform=224, factors=(8, 4, 7, 7, 3), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=5488, workgroup_size=256, threads_per_transform=196, factors=(7, 4, 7, 4, 7), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=6144, workgroup_size=384, threads_per_transform=256, factors=(4, 8, 8, 8, 3), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=6561, workgroup_size=256, threads_per_transform=243, factors=(3, 3, 3, 3, 3, 3, 3, 3), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=8192, workgroup_size=512, threads_per_transform=512, factors=(16, 4, 16, 8), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=9216, workgroup_size=512, threads_per_transform=512, factors=(4, 8, 4, 4, 3, 6), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=10000, workgroup_size=512, threads_per_transform=500, factors=(4, 5, 5, 10, 10), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=10240, workgroup_size=512, threads_per_transform=512, factors=(8, 4, 4, 4, 5, 4), lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=10752, workgroup_size=512, threads_per_transform=512, factors=(4, 16, 8, 7, 3), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=11200, workgroup_size=512, threads_per_transform=448, factors=(4, 7, 5, 16, 5), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=12288, workgroup_size=512, threads_per_transform=512, factors=(8, 8, 4, 6, 8), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=16384, workgroup_size=512, threads_per_transform=512, factors=(8, 16, 4, 8, 4), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=16807, workgroup_size=384, threads_per_transform=343, factors=(7, 7, 7, 7, 7), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=18816, workgroup_size=512, threads_per_transform=448, factors=(8, 8, 7, 7, 6), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=19200, workgroup_size=512, threads_per_transform=480, factors=(8, 10, 8, 5, 6), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), + NS(length=20480, workgroup_size=512, threads_per_transform=512, factors=(4, 4, 16, 10, 8), precision=['sp', 'hp'], lds_size_bytes=lds_config.SIZE_160KiB.value, runtime_compile=True), ] diff --git a/projects/rocfft/library/src/include/function_map_key.h b/projects/rocfft/library/src/include/function_map_key.h index bf85c67a54c..3065a153d19 100644 --- a/projects/rocfft/library/src/include/function_map_key.h +++ b/projects/rocfft/library/src/include/function_map_key.h @@ -289,10 +289,12 @@ struct FMKeyBase { FMKeyBase(std::array lengths, rocfft_precision precision, - ComputeScheme scheme = CS_NONE) + ComputeScheme scheme = CS_NONE, + std::string gcn_arch_name = get_curr_gcn_arch_name()) : lengths(lengths) , precision(precision) , scheme(scheme) + , gcn_arch_name(gcn_arch_name) { } @@ -310,6 +312,7 @@ struct FMKeyBase rocfft_precision precision; ComputeScheme scheme; + std::string gcn_arch_name; }; // length, precision, scheme are theose fundemantal information of a kernel; @@ -340,8 +343,9 @@ struct FMKey : public FMKeyBase rocfft_precision precision, ComputeScheme scheme = CS_KERNEL_STOCKHAM, SBRC_TRANSPOSE_TYPE transpose = NONE, - KernelConfig kernel_config = KernelConfig::EmptyConfig()) - : FMKeyBase({length0, 0, 0}, precision, scheme) + KernelConfig kernel_config = KernelConfig::EmptyConfig(), + std::string gcn_arch_name = get_curr_gcn_arch_name()) + : FMKeyBase({length0, 0, 0}, precision, scheme, gcn_arch_name) , sbrcTrans(transpose) , kernel_config(kernel_config) @@ -354,8 +358,9 @@ struct FMKey : public FMKeyBase rocfft_precision precision, ComputeScheme scheme = CS_KERNEL_2D_SINGLE, SBRC_TRANSPOSE_TYPE transpose = NONE, - KernelConfig kernel_config = KernelConfig::EmptyConfig()) - : FMKeyBase({length0, length1, 0}, precision, scheme) + KernelConfig kernel_config = KernelConfig::EmptyConfig(), + std::string gcn_arch_name = get_curr_gcn_arch_name()) + : FMKeyBase({length0, length1, 0}, precision, scheme, gcn_arch_name) , sbrcTrans(transpose) , kernel_config(kernel_config) { @@ -365,9 +370,13 @@ struct FMKey : public FMKeyBase bool operator==(const FMKey& rhs) const { - return std::tie(lengths, precision, scheme, sbrcTrans, kernel_config) - == std::tie( - rhs.lengths, rhs.precision, rhs.scheme, rhs.sbrcTrans, rhs.kernel_config); + return std::tie(lengths, precision, scheme, sbrcTrans, kernel_config, gcn_arch_name) + == std::tie(rhs.lengths, + rhs.precision, + rhs.scheme, + rhs.sbrcTrans, + rhs.kernel_config, + rhs.gcn_arch_name); } bool operator!=(const FMKey& rhs) const @@ -377,8 +386,13 @@ struct FMKey : public FMKeyBase bool operator<(const FMKey& rhs) const { - return std::tie(lengths, precision, scheme, sbrcTrans, kernel_config) - < std::tie(rhs.lengths, rhs.precision, rhs.scheme, rhs.sbrcTrans, rhs.kernel_config); + return std::tie(lengths, precision, scheme, sbrcTrans, kernel_config, gcn_arch_name) + < std::tie(rhs.lengths, + rhs.precision, + rhs.scheme, + rhs.sbrcTrans, + rhs.kernel_config, + rhs.gcn_arch_name); } static FMKey EmptyFMKey() @@ -434,7 +448,8 @@ struct ToString str += FieldDescriptor().describe("sbrc_trans", PrintSBRCTransposeType(value.sbrcTrans)) + ","; - str += FieldDescriptor().describe("kernelConfig", value.kernel_config); + str += FieldDescriptor().describe("kernelConfig", value.kernel_config) + ","; + str += FieldDescriptor().describe("gcn_arch_name", value.gcn_arch_name); str += "}"; return str; } @@ -448,18 +463,21 @@ struct FromString std::vector len; std::string precStr, schemeStr, sbrcTransStr; KernelConfig config; + std::string gcn_arch_name; VectorFieldParser().parse("lengths", len, current); FieldParser().parse("precision", precStr, current); FieldParser().parse("scheme", schemeStr, current); FieldParser().parse("sbrc_trans", sbrcTransStr, current); FieldParser().parse("kernelConfig", config, current); + FieldParser().parse("gcn_arch_name", gcn_arch_name, current); ret.lengths = {len[0], len[1]}; ret.precision = StrToPrecision(precStr); ret.scheme = StrToComputeScheme(schemeStr); ret.sbrcTrans = StrToSBRCTransType(sbrcTransStr); ret.kernel_config = config; + ret.gcn_arch_name = gcn_arch_name; } }; @@ -475,6 +493,7 @@ struct SimpleHash h ^= std::hash{}(p.scheme); h ^= std::hash{}(p.sbrcTrans); h ^= std::hash{}(p.kernel_config); + h ^= std::hash{}(p.gcn_arch_name); return h; } @@ -503,8 +522,9 @@ struct PPFMKey : public FMKeyBase rocfft_precision precision, ComputeScheme scheme = CS_3D_PP, KernelConfig kernel_config_1 = KernelConfig::EmptyConfig(), - KernelConfig kernel_config_2 = KernelConfig::EmptyConfig()) - : FMKeyBase({length0, length1, length2}, precision, scheme) + KernelConfig kernel_config_2 = KernelConfig::EmptyConfig(), + std::string gcn_arch_name = get_curr_gcn_arch_name()) + : FMKeyBase({length0, length1, length2}, precision, scheme, gcn_arch_name) , kernel_config_1(kernel_config_1) , kernel_config_2(kernel_config_2) { @@ -514,12 +534,13 @@ struct PPFMKey : public FMKeyBase bool operator==(const PPFMKey& rhs) const { - return std::tie(lengths, precision, scheme, kernel_config_1, kernel_config_2) + return std::tie(lengths, precision, scheme, kernel_config_1, kernel_config_2, gcn_arch_name) == std::tie(rhs.lengths, rhs.precision, rhs.scheme, rhs.kernel_config_1, - rhs.kernel_config_2); + rhs.kernel_config_2, + rhs.gcn_arch_name); } bool operator!=(const PPFMKey& rhs) const @@ -529,12 +550,13 @@ struct PPFMKey : public FMKeyBase bool operator<(const PPFMKey& rhs) const { - return std::tie(lengths, precision, scheme, kernel_config_1, kernel_config_2) + return std::tie(lengths, precision, scheme, kernel_config_1, kernel_config_2, gcn_arch_name) < std::tie(rhs.lengths, rhs.precision, rhs.scheme, rhs.kernel_config_1, - rhs.kernel_config_2); + rhs.kernel_config_2, + rhs.gcn_arch_name); } static PPFMKey EmptyPPFMKey() @@ -562,6 +584,7 @@ struct SimpleHashPP h ^= std::hash{}(p.scheme); h ^= std::hash{}(p.kernel_config_1); h ^= std::hash{}(p.kernel_config_2); + h ^= std::hash{}(p.gcn_arch_name); return h; } diff --git a/projects/rocfft/library/src/include/function_pool.h b/projects/rocfft/library/src/include/function_pool.h index 14ed7d063e9..347ba1729c0 100644 --- a/projects/rocfft/library/src/include/function_pool.h +++ b/projects/rocfft/library/src/include/function_pool.h @@ -24,6 +24,7 @@ #define FUNCTION_POOL_H #include "../../../shared/arithmetic.h" +#include "../../../shared/device_properties.h" #include "../../../shared/rocfft_complex.h" #include "../device/kernels/common.h" #include "function_map_key.h" @@ -38,7 +39,8 @@ inline std::string PrintMissingKernelInfoBase(const FMKeyBase& key) msg << "Kernel not found: \n" << "\tlength: " << key.lengths[0] << "," << key.lengths[1] << "\n" << "\tprecision: " << key.precision << "\n" - << "\tscheme: " << PrintScheme(key.scheme) << "\n"; + << "\tscheme: " << PrintScheme(key.scheme) << "\n" + << "\tGCN Arch Name: " << key.gcn_arch_name << "\n"; return msg.str(); } @@ -211,26 +213,30 @@ class function_pool return best; } - const FMKey& get_actual_key(const FMKey& key) const + template + const TKey& get_actual_key(const TKey& key, TKeyPool& pool) const { // - for keys that we are querying with no/empty kernel-config, actually we are refering to // the default kernel-configs in kernel-generator.py. So get the actual keys to look-up // the pool. - // - if not in the def_key_pool, then we simply use itself (for dynamically added kernel) - auto it = find_key_in_map(def_key_pool, key); - if(it != def_key_pool.end()) - return it->second; - else - return key; - } + // - if not in the pool, then we simply use itself (for dynamically added kernel) - const PPFMKey& get_actual_key(const PPFMKey& key) const - { - auto it = find_key_in_map(def_pp_key_pool, key); - if(it != def_pp_key_pool.end()) + // First attempt an exact match with the given architecture in gcn_arch_name if possible + auto it = find_key_in_map(pool, key); + if(it != pool.end()) return it->second; else - return key; + { + // If a match is not found, try it with the generic arch kernel + auto key_copy = key; + key_copy.gcn_arch_name = generic_gcn_arch_name; + + auto it = find_key_in_map(pool, key_copy); + if(it != pool.end()) + return it->second; + else + return key; + } } public: @@ -283,13 +289,13 @@ class function_pool bool has_function(const FMKey& key) const { - auto real_key = get_actual_key(key); + auto real_key = get_actual_key(key, def_key_pool); return find_key_in_map(function_map, real_key) != function_map.end(); } bool has_function(const PPFMKey& key) const { - auto real_key = get_actual_key(key); + auto real_key = get_actual_key(key, def_pp_key_pool); return find_key_in_map(pp_function_map, real_key) != pp_function_map.end(); } @@ -328,7 +334,7 @@ class function_pool FFTKernel get_kernel(const FMKey& key) const { - auto real_key = get_actual_key(key); + auto real_key = get_actual_key(key, def_key_pool); auto it = find_key_in_map(function_map, real_key); if(it == function_map.end()) throw std::out_of_range("kernel not found in map"); @@ -337,7 +343,7 @@ class function_pool FFTKernel get_kernel(const PPFMKey& key, ComputeScheme scheme) const { - auto real_key = get_actual_key(key); + auto real_key = get_actual_key(key, def_pp_key_pool); auto it = find_key_in_map(pp_function_map, real_key); if(it == pp_function_map.end()) throw std::out_of_range("kernel not found in partial-pass map"); @@ -395,8 +401,12 @@ static void insert_default_entry(const FMKey& def_key, FPMap& function_map, size_t lds_size_bytes) { - FMKey def_key_with_lds = def_key; - def_key_with_lds.lds_size_bytes = lds_size_bytes; + FMKey def_key_with_lds = def_key; + + // Specifically add the current device's max LDS size if not a generic arch entry + def_key_with_lds.lds_size_bytes = def_key.gcn_arch_name == generic_gcn_arch_name + ? lds_size_bytes + : get_curr_device_prop().sharedMemPerBlock; // simple_key means the same thing as def_key, but we just remove kernel-config // so we don't need to know the exact config when we're lookin' for the default kernel @@ -416,8 +426,12 @@ static void insert_default_entry(const PPFMKey& def_key, PPFPMap& function_map, size_t lds_size_bytes) { - PPFMKey def_key_with_lds = def_key; - def_key_with_lds.lds_size_bytes = lds_size_bytes; + PPFMKey def_key_with_lds = def_key; + + // Specifically add the current device's max LDS size if not a generic arch entry + def_key_with_lds.lds_size_bytes = def_key.gcn_arch_name == generic_gcn_arch_name + ? lds_size_bytes + : get_curr_device_prop().sharedMemPerBlock; PPFMKey simple_key(def_key_with_lds); diff --git a/projects/rocfft/library/src/rocfft_aot_helper.cpp b/projects/rocfft/library/src/rocfft_aot_helper.cpp index 5468e211f1c..300baebc16c 100644 --- a/projects/rocfft/library/src/rocfft_aot_helper.cpp +++ b/projects/rocfft/library/src/rocfft_aot_helper.cpp @@ -245,7 +245,8 @@ void build_stockham_function_pool(CompileQueue& queue) StockhamGeneratorSpecs specs{factors, {}, - {static_cast(precision)}, + static_cast(precision), + get_curr_gcn_arch_name(), static_cast(i.second.workgroup_size), PrintScheme(scheme)}; specs.threads_per_transform = i.second.threads_per_transform[0]; @@ -663,7 +664,8 @@ void build_solution_kernels(CompileQueue& queue) StockhamGeneratorSpecs specs{factors, {}, - {static_cast(precision)}, + static_cast(precision), + get_curr_gcn_arch_name(), static_cast(config.workgroup_size), PrintScheme(scheme)}; specs.threads_per_transform = config.threads_per_transform[0]; diff --git a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp index 7b6e4141252..5fa68d3ef78 100644 --- a/projects/rocfft/library/src/rocfft_kernel_config_search.cpp +++ b/projects/rocfft/library/src/rocfft_kernel_config_search.cpp @@ -31,6 +31,7 @@ #include "../../shared/CLI11.hpp" #include "../../shared/arithmetic.h" +#include "../../shared/device_properties.h" #include "../../shared/gpubuf.h" #include "../../shared/hip_object_wrapper.h" #include "device/generator/stockham_gen.h" @@ -159,7 +160,8 @@ std::string test_kernel_src(const std::string& kernel_name, { StockhamGeneratorSpecs specs{factorization, {}, - {static_cast(rocfft_precision_single)}, + static_cast(rocfft_precision_single), + get_curr_gcn_arch_name(), wgs, PrintScheme(compute_scheme)}; diff --git a/projects/rocfft/library/src/rtc_stockham_kernel.cpp b/projects/rocfft/library/src/rtc_stockham_kernel.cpp index 543be5a551f..ad4e5be59f6 100644 --- a/projects/rocfft/library/src/rtc_stockham_kernel.cpp +++ b/projects/rocfft/library/src/rtc_stockham_kernel.cpp @@ -74,11 +74,12 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& std::vector factors; std::copy(kernel->factors.begin(), kernel->factors.end(), std::back_inserter(factors)); - std::vector precisions = {static_cast(node.precision)}; + auto precision = static_cast(node.precision); specs.emplace(factors, std::vector(), - precisions, + precision, + get_curr_gcn_arch_name(), static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs->threads_per_transform = kernel->threads_per_transform[0]; @@ -105,7 +106,8 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& std::vector factors1d; std::vector factors2d; - std::vector precisions = {static_cast(node.precision)}; + + auto precision = static_cast(node.precision); // need to break down factors into first dim and second dim size_t len0_remain = node.length[0]; @@ -124,7 +126,8 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs.emplace(factors1d, factors2d, - precisions, + precision, + get_curr_gcn_arch_name(), static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs->threads_per_transform = kernel->threads_per_transform[0]; @@ -133,7 +136,8 @@ RTCKernel::RTCGenerator RTCKernelStockham::generate_from_node(const LeafNode& specs2d.emplace(factors2d, factors1d, - precisions, + precision, + get_curr_gcn_arch_name(), static_cast(kernel->workgroup_size), PrintScheme(node.scheme)); specs2d->threads_per_transform = kernel->threads_per_transform[1]; diff --git a/projects/rocfft/shared/device_properties.h b/projects/rocfft/shared/device_properties.h index becc4c264e3..4d4c001a55a 100644 --- a/projects/rocfft/shared/device_properties.h +++ b/projects/rocfft/shared/device_properties.h @@ -26,6 +26,10 @@ #include #include +// Generic GCN arch name for configuration entries that +// do not report a specific name in kernel_generator.py +static const std::string generic_gcn_arch_name("gfx_generic"); + // get device properties static hipDeviceProp_t get_curr_device_prop() { @@ -41,6 +45,29 @@ static hipDeviceProp_t get_curr_device_prop() return prop; } +// Get current device GCN arch name without any suffix after ':' +// If no devices are found, return empty string +static std::string get_curr_gcn_arch_name() +{ + int dev_count = 0; + auto ret = hipGetDeviceCount(&dev_count); + if(ret != hipSuccess || dev_count == 0) + return ""; + + auto dev_prop = get_curr_device_prop(); + std::string arch_name_full = dev_prop.gcnArchName; + + if(arch_name_full.empty()) + throw std::runtime_error("Device GCN arch name is empty."); + + auto delimiter = ":"; + auto pos = arch_name_full.find(delimiter); + if(pos != std::string::npos) + arch_name_full.erase(pos); + + return arch_name_full; +} + // check that the given grid/block dims will fit into the limits in // the device properties. throws std::runtime_error if the limits // are exceeded.