Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
218573d
- Add ability to have per precision kernel configuration entries.
eng-flavio-teixeira Nov 3, 2025
e64821d
- Change bytes_per_element calculation to match what was previously …
eng-flavio-teixeira Nov 5, 2025
2512301
- Add support for arch specific entries in the function pool.
eng-flavio-teixeira Nov 24, 2025
15761c2
- Further changes for arch support in the function pool.
eng-flavio-teixeira Nov 27, 2025
91948d0
Merge commit '85ea1d36a78c3fca8485cbf6833497cfd18596ac' into function…
eng-flavio-teixeira Nov 27, 2025
7217193
- CHANGELOG.
eng-flavio-teixeira Nov 27, 2025
326090e
- Remove suffix from get_curr_gcn_arch_name().
eng-flavio-teixeira Dec 1, 2025
8a9ee5b
- Address review suggestion.
eng-flavio-teixeira Dec 4, 2025
7b0d134
- Add missing gcn_arch_name in kernel_name()
eng-flavio-teixeira Dec 8, 2025
7055d54
- Fix gcn_arch_name handling for partial-pass kernels.
eng-flavio-teixeira Dec 8, 2025
554633e
- Fix partial-pass kernel check.
eng-flavio-teixeira Dec 9, 2025
24db868
- Add comment.
eng-flavio-teixeira Dec 10, 2025
21ba23e
Merge branch 'develop' into function_pool_device_arch
eng-flavio-teixeira Dec 10, 2025
03fef0c
Merge branch 'develop' into function_pool_device_arch
eng-flavio-teixeira Dec 10, 2025
ea12d45
- No need to put device arch name in StockhamGeneratorSpecs for aot …
eng-flavio-teixeira Dec 10, 2025
8467235
- Remove no longer needed calls to get_curr_gcn_arch_name to constru…
eng-flavio-teixeira Dec 10, 2025
a429b2b
- Remove header include.
eng-flavio-teixeira Dec 10, 2025
6114b2f
- Handle the case where FMKey is constructed without a visible HIP d…
eng-flavio-teixeira Dec 10, 2025
0980785
Revert " - No need to put device arch name in StockhamGeneratorSpecs …
eng-flavio-teixeira Dec 10, 2025
21e66ae
Revert " - Remove no longer needed calls to get_curr_gcn_arch_name to…
eng-flavio-teixeira Dec 10, 2025
f1e5984
Revert " - Remove header include."
eng-flavio-teixeira Dec 10, 2025
e22d8fa
Merge branch 'develop' into function_pool_device_arch
eng-flavio-teixeira Dec 10, 2025
36a30b7
- Take hip return into account.
eng-flavio-teixeira Dec 10, 2025
19ca6d6
Merge branch 'function_pool_device_arch' of https://github.com/eng-fl…
eng-flavio-teixeira Dec 10, 2025
5210b5c
Merge branch 'develop' into function_pool_device_arch
eng-flavio-teixeira Dec 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions projects/rocfft/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ Documentation for rocFFT is available at

## rocFFT 1.0.36 (unreleased)

### Added

* Added support for per precision and architecture kernel configuration entries in the library.

### Optimized

* Removed a potential unnecessary global transpose operation from MPI 3D multi-GPU pencil decompositions.
Expand Down
2 changes: 1 addition & 1 deletion projects/rocfft/library/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ set( kgen_logic_files
# python code that decides kernel parameters
${CMAKE_SOURCE_DIR}/library/src/device/kernel-generator.py
${CMAKE_SOURCE_DIR}/library/src/device/generator.py
${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_lds.py
${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_arch.py
${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_sbrr.py
${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_sbcc.py
${CMAKE_SOURCE_DIR}/library/src/device/kernels/configs/config_sbcr.py
Expand Down
139 changes: 77 additions & 62 deletions projects/rocfft/library/src/device/generator/stockham_gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
using namespace std::placeholders;

#include "../../../../shared/arithmetic.h"
#include "../../../../shared/device_properties.h"
#include "../../../../shared/precision_type.h"
#include "generator.h"
#include "stockham_gen.h"
Expand Down Expand Up @@ -52,7 +53,8 @@ struct GeneratedLauncher
const std::vector<unsigned int>& pp_factors_other,
const unsigned int& pp_current_dim,
const unsigned int& pp_off_dim,
bool double_precision,
const unsigned int& precision_type,
const std::string& gcn_arch_name,
Comment thread
eng-flavio-teixeira marked this conversation as resolved.
const std::string& sbrc_type,
const std::string& sbrc_transpose_type)
: scheme(scheme)
Expand All @@ -69,7 +71,9 @@ struct GeneratedLauncher
, direct_to_from_reg(kernel.direct_to_from_reg)
, sbrc_type(sbrc_type)
, sbrc_transpose_type(sbrc_transpose_type)
, double_precision(double_precision)
, precision_type(precision_type)
, gcn_arch_name(gcn_arch_name)

{
}

Expand All @@ -90,7 +94,10 @@ struct GeneratedLauncher
// SBRC transpose type
std::string sbrc_type;
std::string sbrc_transpose_type;
bool double_precision;

unsigned int precision_type;

std::string gcn_arch_name;

// output a json object that the python generator can parse to know
// how to build the function pool
Expand Down Expand Up @@ -129,7 +136,8 @@ struct GeneratedLauncher
add_member("direct_to_from_reg", direct_to_from_reg ? "true" : "false");
add_member("sbrc_type", quote_str(sbrc_type));
add_member("sbrc_transpose_type", quote_str(sbrc_transpose_type));
add_member("double_precision", double_precision ? "true" : "false");
add_member("precision_type", std::to_string(precision_type));
add_member("gcn_arch_name", quote_str(gcn_arch_name));
add_member("pp_child_scheme", quote_str(pp_child_scheme));
add_member("pp_factors_curr", vec_to_list(pp_factors_curr));
add_member("pp_factors_other", vec_to_list(pp_factors_other));
Expand All @@ -149,31 +157,31 @@ struct LaunchSuffix
std::string sbrc_transpose_type;
};

void make_launcher(const std::vector<unsigned int>& precision_types,
void make_launcher(const unsigned int& precision_type,
const std::vector<LaunchSuffix>& launcher_suffixes,
StockhamKernel& kernel,
const std::string& gcn_arch_name,
const std::string& pp_child_scheme,
const std::vector<unsigned int>& pp_factors_curr,
const std::vector<unsigned int>& pp_factors_other,
const unsigned int& pp_current_dim,
const unsigned int& pp_off_dim,
std::vector<GeneratedLauncher>& generated_launchers)
{
for(auto precision_type : precision_types)

for(auto&& launcher : launcher_suffixes)
{
for(auto&& launcher : launcher_suffixes)
{
generated_launchers.emplace_back(kernel,
launcher.scheme,
pp_child_scheme,
pp_factors_curr,
pp_factors_other,
pp_current_dim,
pp_off_dim,
precision_type == rocfft_precision_double,
launcher.sbrc_type,
launcher.sbrc_transpose_type);
}
generated_launchers.emplace_back(kernel,
launcher.scheme,
pp_child_scheme,
pp_factors_curr,
pp_factors_other,
pp_current_dim,
pp_off_dim,
precision_type,
gcn_arch_name,
launcher.sbrc_type,
launcher.sbrc_transpose_type);
}
}

Expand Down Expand Up @@ -255,9 +263,10 @@ void stockham_partial_pass_variants(const std::string& kernel_name
if(params_1.current_dim == 0 && params_2.current_dim == 2)
{
StockhamPartialPassKernelRR kernelRR(specs1, params_1);
make_launcher(specs1.precisions,
make_launcher(specs1.precision,
{{"pp_stoc", specs1.scheme, "", ""}},
kernelRR,
specs1.gcn_arch_name,
"CS_KERNEL_STOCKHAM_PP",
params_1.pp_factors_curr,
params_1.pp_factors_other,
Expand All @@ -266,9 +275,10 @@ void stockham_partial_pass_variants(const std::string& kernel_name
launchers);

StockhamPartialPassKernelCC kernelCC(specs2, params_2, false);
make_launcher(specs2.precisions,
make_launcher(specs2.precision,
{{"pp_sbcc", specs2.scheme, "", ""}},
kernelCC,
specs2.gcn_arch_name,
"CS_KERNEL_STOCKHAM_PP_BLOCK_CC",
params_2.pp_factors_curr,
params_2.pp_factors_other,
Expand All @@ -279,9 +289,10 @@ void stockham_partial_pass_variants(const std::string& kernel_name
else if(params_1.current_dim == 2 && params_2.current_dim == 0)
{
StockhamPartialPassKernelRR kernelCC(specs1, params_1);
make_launcher(specs1.precisions,
make_launcher(specs1.precision,
{{"pp_sbcc", specs1.scheme, "", ""}},
kernelCC,
specs1.gcn_arch_name,
"CS_KERNEL_STOCKHAM_PP_BLOCK_CC",
params_1.pp_factors_curr,
params_1.pp_factors_other,
Expand All @@ -290,9 +301,10 @@ void stockham_partial_pass_variants(const std::string& kernel_name
launchers);

StockhamPartialPassKernelCC kernelRR(specs2, params_2, false);
make_launcher(specs2.precisions,
make_launcher(specs2.precision,
{{"pp_stoc", specs2.scheme, "", ""}},
kernelRR,
specs2.gcn_arch_name,
"CS_KERNEL_STOCKHAM_PP",
params_2.pp_factors_curr,
params_2.pp_factors_other,
Expand Down Expand Up @@ -341,9 +353,10 @@ void stockham_variants(const std::string& kernel_name,
if(specs.scheme == "CS_KERNEL_STOCKHAM")
{
StockhamKernelRR kernel(specs);
make_launcher(specs.precisions,
make_launcher(specs.precision,
{{"stoc", specs.scheme, "", ""}},
kernel,
specs.gcn_arch_name,
"CS_NONE",
std::vector<unsigned int>(),
std::vector<unsigned int>(),
Expand All @@ -354,9 +367,10 @@ void stockham_variants(const std::string& kernel_name,
else if(specs.scheme == "CS_KERNEL_STOCKHAM_BLOCK_CC")
{
StockhamKernelCC kernel(specs, false, false);
make_launcher(specs.precisions,
make_launcher(specs.precision,
{{"sbcc", specs.scheme, "", ""}},
kernel,
specs.gcn_arch_name,
"CS_NONE",
std::vector<unsigned int>(),
std::vector<unsigned int>(),
Expand Down Expand Up @@ -405,9 +419,10 @@ void stockham_variants(const std::string& kernel_name,
"SBRC_3D_FFT_ERC_TRANS_Z_XY",
"TILE_UNALIGNED"});

make_launcher(specs.precisions,
make_launcher(specs.precision,
suffixes,
kernel,
specs.gcn_arch_name,
"CS_NONE",
std::vector<unsigned int>(),
std::vector<unsigned int>(),
Expand All @@ -419,9 +434,10 @@ void stockham_variants(const std::string& kernel_name,
{
StockhamKernelCR kernel(specs);

make_launcher(specs.precisions,
make_launcher(specs.precision,
{{"sbcr", specs.scheme, "", ""}},
kernel,
specs.gcn_arch_name,
"CS_NONE",
std::vector<unsigned int>(),
std::vector<unsigned int>(),
Expand All @@ -433,39 +449,24 @@ void stockham_variants(const std::string& kernel_name,
{
StockhamKernelFused2D fused2d(specs, specs2d);

// output 2D launchers
for(auto prec_type : specs.precisions)
{
launchers.emplace_back(fused2d,
specs.scheme,
"CS_NONE",
std::vector<unsigned int>(),
std::vector<unsigned int>(),
0,
0,
(prec_type == rocfft_precision_double),
"",
"");
}
launchers.emplace_back(fused2d,
specs.scheme,
"CS_NONE",
std::vector<unsigned int>(),
std::vector<unsigned int>(),
0,
0,
specs.precision,
specs.gcn_arch_name,
"",
"");
}
else
throw std::runtime_error("unhandled scheme");

output_json(launchers, kernel_name, output);
}

static size_t max_bytes_per_element(const std::vector<unsigned int>& precisions)
{
// generate for the maximum element size in the available
// precisions
size_t element_size = 0;
for(auto p : precisions)
{
element_size = std::max(element_size, complex_type_size(static_cast<rocfft_precision>(p)));
}
return element_size;
}

// =========================================================
// Partial pass parameters row-major ordering helpers.
// Kernel configuration parameters for CS_3D_PP in
Expand Down Expand Up @@ -636,9 +637,11 @@ int main()
std::string line;

std::string kernel_name;
std::string gcn_arch_name;
std::string scheme;
bool half_lds;
unsigned int lds_size_bytes;
unsigned int bytes_per_element;

const char* DELIM = "";
std::cout << "{";
Expand All @@ -661,6 +664,9 @@ int main()
lds_size_bytes = std::stoul(*arg);
++arg;

bytes_per_element = std::stoul(*arg);
++arg;

std::string kernel_name = *arg;

++arg;
Expand Down Expand Up @@ -698,8 +704,11 @@ int main()
threads_per_transform = parse_uints_csv(*arg);

++arg;
std::vector<unsigned int> precisions;
precisions = parse_uints_csv(*arg);
gcn_arch_name = *arg;

++arg;
unsigned int precision;
precision = std::stoul(*arg);

// create spec and pass to stockham_variants, writes partial output to stdout
std::cout << DELIM;
Expand Down Expand Up @@ -733,12 +742,14 @@ int main()
if(direct_to_from_reg.size() != 2)
throw std::runtime_error("CS_3D_PP requires two direct_to_from_reg configuration");

StockhamGeneratorSpecs specs1(factors1, {}, precisions, workgroup_size[0], scheme);
StockhamGeneratorSpecs specs1(
factors1, {}, precision, gcn_arch_name, workgroup_size[0], scheme);
specs1.direct_to_from_reg = direct_to_from_reg[0];
specs1.threads_per_transform = threads_per_transform[0];
specs1.wgs_is_derived = true;

StockhamGeneratorSpecs specs2(factors2, {}, precisions, workgroup_size[1], scheme);
StockhamGeneratorSpecs specs2(
factors2, {}, precision, gcn_arch_name, workgroup_size[1], scheme);
specs2.direct_to_from_reg = direct_to_from_reg[1];
specs2.threads_per_transform = threads_per_transform[1];
specs2.wgs_is_derived = true;
Expand Down Expand Up @@ -769,24 +780,28 @@ int main()
++arg;
factors = parse_uints_csv(*arg);

StockhamGeneratorSpecs specs(factors, factors2d, precisions, workgroup_size[0], scheme);
StockhamGeneratorSpecs specs(
factors, factors2d, precision, gcn_arch_name, workgroup_size[0], scheme);
specs.half_lds = half_lds;
specs.direct_to_from_reg = direct_to_from_reg[0];

specs.bytes_per_element = max_bytes_per_element(precisions);
specs.bytes_per_element = bytes_per_element;

specs.threads_per_transform = threads_per_transform.front();

// second dimension for 2D_SINGLE
StockhamGeneratorSpecs specs2d(
factors2d, factors, precisions, workgroup_size[0], scheme);
factors2d, factors, precision, gcn_arch_name, workgroup_size[0], scheme);

if(!threads_per_transform.empty())
specs2d.threads_per_transform = threads_per_transform.back();

// 2D_SINGLE kernels use the specified workgroup size
// directly
if(scheme == "CS_KERNEL_2D_SINGLE")
// directly.
// Kernels with an architecture other than generic will
// also use the workgroup size directly, as the calculations
// for wgs_is_derived=false assume an LDS size of 64KiB.
if(scheme == "CS_KERNEL_2D_SINGLE" || gcn_arch_name != generic_gcn_arch_name)
{
specs.wgs_is_derived = true;
specs2d.wgs_is_derived = true;
Expand Down
9 changes: 6 additions & 3 deletions projects/rocfft/library/src/device/generator/stockham_gen.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@ struct StockhamGeneratorSpecs
{
StockhamGeneratorSpecs(const std::vector<unsigned int>& factors,
const std::vector<unsigned int>& factors2d,
const std::vector<unsigned int>& precisions,
unsigned int precision,
const std::string& gcn_arch_name,
unsigned int workgroup_size,
const std::string& scheme)
: factors(factors)
, factors2d(factors2d)
, precisions(precisions)
, precision(precision)
, gcn_arch_name(gcn_arch_name)
, length(product(factors.begin(), factors.end()))
, length2d(product(factors2d.begin(), factors2d.end()))
, workgroup_size(workgroup_size)
Expand All @@ -47,7 +49,8 @@ struct StockhamGeneratorSpecs

std::vector<unsigned int> factors;
std::vector<unsigned int> factors2d;
std::vector<unsigned int> precisions; // mapped from rocfft_precision
unsigned int precision; // mapped from rocfft_precision
std::string gcn_arch_name;
unsigned int length;
unsigned int length2d = 0;

Expand Down
Loading