From 675fe00a3f07b2a51fdb94b892777556b655ab52 Mon Sep 17 00:00:00 2001 From: Raphael Egan Date: Wed, 25 Jun 2025 21:54:37 +0000 Subject: [PATCH 1/5] Enable (and test for) externally-managed workareas via hipfft_params --- clients/hipfft_params.h | 419 +++++++++++++++++++++++++--- clients/tests/accuracy_test_1D.cpp | 35 ++- clients/tests/accuracy_test_2D.cpp | 21 ++ clients/tests/accuracy_test_3D.cpp | 22 ++ clients/tests/gtest_main.cpp | 8 + clients/tests/multi_device_test.cpp | 18 +- library/src/amd_detail/hipfft.cpp | 58 ++-- shared/accuracy_test.h | 4 +- shared/fft_params.h | 68 ++++- shared/params_gen.h | 54 +++- shared/rocfft_params.h | 6 +- 11 files changed, 588 insertions(+), 125 deletions(-) diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h index 175f5f20..c707f1ac 100644 --- a/clients/hipfft_params.h +++ b/clients/hipfft_params.h @@ -182,21 +182,43 @@ class hipfft_params : public fft_params // backend library can write N worksize values for N GPUs, so // allocate a vector for that if necessary - std::vector xt_worksize; + std::vector auto_allocated_worksizes; + // if auto_allocate == fft_auto_allocation_off, the hipFFT plan(s) + // will be provided with externally-managed work area(s): + static std::vector externally_managed_workareas; - // pointer we pass to the backend library. By default point to the - // single-GPU workbuffer size. 
- size_t* workbuffersize_ptr; + size_t auto_allocated_extra_vram_footprint() const + { + return std::accumulate(auto_allocated_worksizes.begin(), + auto_allocated_worksizes.end(), + static_cast(0)); + } - hipfft_params() + static size_t externally_managed_extra_vram_footprint() { - workbuffersize_ptr = &workbuffersize; + return std::accumulate(externally_managed_workareas.begin(), + externally_managed_workareas.end(), + static_cast(0), + [](size_t total, const gpubuf& buf) { return total + buf.size(); }); } + bool is_preventing_auto_allocation_at_generation() const + { + if(auto_allocate != fft_auto_allocation_off) + return false; + // Let hipFFT sometimes auto-allocate nonetheless so that tests cover its + // ability to free resources (allocated at generation) when/if some + // externally-managed workarea(s) are provided after plan generation + // Note: this member function must return the same result even if called + // more than once by a given instance, it must be stable for any instance + return (random_seed + std::hash()(token())) % 2 == 1; + } + + hipfft_params() = default; + hipfft_params(const fft_params& p) : fft_params(p) { - workbuffersize_ptr = &workbuffersize; } ~hipfft_params() @@ -228,12 +250,14 @@ class hipfft_params : public fft_params } catch(fft_params::work_buffer_alloc_failure& e) { - val += workbuffersize; + val += auto_allocated_extra_vram_footprint(); + val += externally_managed_extra_vram_footprint(); std::stringstream msg; msg << "Plan work buffer size (" << val << " bytes raw data) too large for device"; throw ROCFFT_SKIP{msg.str()}; } - val += workbuffersize; + val += auto_allocated_extra_vram_footprint(); + val += externally_managed_extra_vram_footprint(); return val; } @@ -358,6 +382,11 @@ class hipfft_params : public fft_params int_inembed[i] = ll_inembed[i]; int_onembed[i] = ll_onembed[i]; } + // reset auto_allocated_worksizes + auto_allocated_worksizes.resize(get_num_used_gpus()); + std::for_each(auto_allocated_worksizes.begin(), + 
auto_allocated_worksizes.end(), + [](decltype(auto_allocated_worksizes)::value_type& val) { val = 0; }); hipfftResult ret = HIPFFT_SUCCESS; return fft_status_from_hipfftparams(ret); @@ -424,13 +453,34 @@ class hipfft_params : public fft_params } } + if(ret == HIPFFT_SUCCESS && auto_allocate == fft_auto_allocation_off) + { + ret = set_externally_managed_work_areas(); + } + // hipFFT can fail plan creation due to allocation failure - // tests are expecting a specific exception in that case, // because the test was unable to run. Doesn't mean the test // case failed. if(ret == HIPFFT_ALLOC_FAILED) - throw fft_params::work_buffer_alloc_failure( - "plan create failed due to allocation failure"); + { + if(!final_attempt_at_plan_creation && externally_managed_extra_vram_footprint() > 0) + { + final_attempt_at_plan_creation = true; + // device allocation(s) in externally_managed_workareas might be + // larger than needed or even unnecessary for the instance of interest. + // Free them up and try again before concluding. 
+ externally_managed_workareas.clear(); + return create_plan(); + } + else + { + throw fft_params::work_buffer_alloc_failure( + "plan create failed due to allocation failure", + externally_managed_extra_vram_footprint() + + auto_allocated_extra_vram_footprint()); + } + } // store token to check if plan was already made current_token = token(); @@ -439,6 +489,13 @@ class hipfft_params : public fft_params void validate_fields() const override { + if(multiGPU > 1 && auto_allocate == fft_auto_allocation_off) + { + // hipfftXtSetWorkArea would be required + throw std::runtime_error( + "cannot request externally-managed work areas with multi-gpu usage"); + } + validate_brick_volume(); // multi-process only works with batch-1 FFTs, as hipFFT has @@ -1005,13 +1062,251 @@ class hipfft_params : public fft_params CREATE_XT_MAKE_PLAN_MANY, }; + // check that worksize estimates can be successfully queried with or without a valid plan + hipfftResult_t check_worksize_estimate() + { + hipfftResult_t ret{HIPFFT_INTERNAL_ERROR}; + if(!hipfft_transform_type) + { + throw std::runtime_error("Estimating worksize requires a valid type of transform"); + } + std::vector worksize_estimate(get_num_used_gpus(), absurd_init_worksize_estimate); + switch(get_create_type()) + { + case CREATE_MAKE_PLAN_Nd: + { + switch(dim()) + { + case 1: + if(plan == INVALID_PLAN_HANDLE) + ret = hipfftEstimate1d( + int_length[0], *hipfft_transform_type, nbatch, worksize_estimate.data()); + else + ret = hipfftGetSize1d(plan, + int_length[0], + *hipfft_transform_type, + nbatch, + worksize_estimate.data()); + break; + case 2: + if(plan == INVALID_PLAN_HANDLE) + ret = hipfftEstimate2d(int_length[0], + int_length[1], + *hipfft_transform_type, + worksize_estimate.data()); + else + ret = hipfftGetSize2d(plan, + int_length[0], + int_length[1], + *hipfft_transform_type, + worksize_estimate.data()); + break; + case 3: + if(plan == INVALID_PLAN_HANDLE) + ret = hipfftEstimate3d(int_length[0], + int_length[1], + 
int_length[2], + *hipfft_transform_type, + worksize_estimate.data()); + else + ret = hipfftGetSize3d(plan, + int_length[0], + int_length[1], + int_length[2], + *hipfft_transform_type, + worksize_estimate.data()); + break; + default: + throw std::runtime_error("invalid dim"); + } + break; + } + case CREATE_MAKE_PLAN_MANY: + { + auto layout_args = make_valid_layout_args_for_plan_many(); + if(plan == INVALID_PLAN_HANDLE) + ret = hipfftEstimateMany(dim(), + int_length.data(), + layout_args.input_embed, + layout_args.input_stride, + layout_args.input_distance, + layout_args.output_embed, + layout_args.output_stride, + layout_args.output_distance, + *hipfft_transform_type, + nbatch, + worksize_estimate.data()); + else + ret = hipfftGetSizeMany(plan, + dim(), + int_length.data(), + layout_args.input_embed, + layout_args.input_stride, + layout_args.input_distance, + layout_args.output_embed, + layout_args.output_stride, + layout_args.output_distance, + *hipfft_transform_type, + nbatch, + worksize_estimate.data()); + break; + } + case CREATE_MAKE_PLAN_MANY64: + { + if(plan == INVALID_PLAN_HANDLE) + { + // no direct equivalent in estimate-fetching APIs + std::for_each(worksize_estimate.begin(), + worksize_estimate.end(), + [](decltype(worksize_estimate)::value_type& val) { val = 0; }); + ret = HIPFFT_SUCCESS; + } + else + { + auto layout_args = make_valid_layout_args_for_plan_many(); + ret = hipfftGetSizeMany64(plan, + dim(), + ll_length.data(), + layout_args.input_embed, + layout_args.input_stride, + layout_args.input_distance, + layout_args.output_embed, + layout_args.output_stride, + layout_args.output_distance, + *hipfft_transform_type, + nbatch, + worksize_estimate.data()); + } + break; + } + case CREATE_XT_MAKE_PLAN_MANY: + { + if(plan == INVALID_PLAN_HANDLE) + { + // no direct equivalent in estimate-fetching APIs + std::for_each(worksize_estimate.begin(), + worksize_estimate.end(), + [](decltype(worksize_estimate)::value_type& val) { val = 0; }); + ret = 
HIPFFT_SUCCESS; + } + else + { + auto executionType = get_xt_api_execution_type(); + auto layout_args = make_valid_layout_args_for_plan_many(); + ret = hipfftXtGetSizeMany(plan, + dim(), + ll_length.data(), + layout_args.input_embed, + layout_args.input_stride, + layout_args.input_distance, + inputType, + layout_args.output_embed, + layout_args.output_stride, + layout_args.output_distance, + outputType, + nbatch, + worksize_estimate.data(), + executionType); + } + break; + } + case PLAN_Nd: + case PLAN_MANY: + default: + { + // should be indirectly disabled via get_create_type() + return HIPFFT_INTERNAL_ERROR; + } + } + // check that the value(s) of worksize_estimate were actually set, assuming that + // setting a worksize_estimate equal to absurd_init_worksize_estimate by hipFFT + // cannot be considered "correct". + // Note: worksize_estimate value(s) are *not* guaranteed to be greater than or equal + // to the actual value(s) of the work area(s), queriable after plan generation via + // hipfftGetSize. + if(ret == HIPFFT_SUCCESS) + { + // the estimate can't have any knowledge about the number of GPUs being used if + // the plan wasn't created first + const size_t num_values_to_check + = plan == INVALID_PLAN_HANDLE ? 1 : worksize_estimate.size(); + for(auto idx = 0; ret == HIPFFT_SUCCESS && idx < num_values_to_check; idx++) + { + ret = worksize_estimate[idx] != absurd_init_worksize_estimate + ? 
HIPFFT_SUCCESS + : HIPFFT_INTERNAL_ERROR; + } + } + return ret; + } + + // provide a work area to a successfully generated plan + hipfftResult_t set_externally_managed_work_areas() + { + std::vector req_workarea_sizes(get_num_used_gpus(), absurd_init_worksize_estimate); + hipfftResult_t ret = hipfftGetSize(plan, req_workarea_sizes.data()); + if(ret != HIPFFT_SUCCESS) + { + return ret; + } + else if(std::any_of(req_workarea_sizes.begin(), + req_workarea_sizes.end(), + [](const decltype(req_workarea_sizes)::value_type& val) { + return val == absurd_init_worksize_estimate; + })) + { + return HIPFFT_INTERNAL_ERROR; + } + // req_workarea_sizes are known and validated + // check if the current externally_managed_workareas can be used as is or not + if(externally_managed_workareas.size() < get_num_used_gpus()) + externally_managed_workareas.resize(get_num_used_gpus()); + std::vector workareas(get_num_used_gpus(), nullptr); + for(auto workarea_idx = 0; workarea_idx < get_num_used_gpus(); workarea_idx++) + { + const auto req_size = req_workarea_sizes[workarea_idx]; + auto& buf = externally_managed_workareas[workarea_idx]; + if(buf.size() < req_size) + { + // too small, free and reallocate to meet current needs + buf.free(); + if(buf.alloc(req_size) != hipSuccess) + { + return HIPFFT_ALLOC_FAILED; + } + } + workareas[workarea_idx] = buf.data(); + } + if(get_num_used_gpus() > 1) + { + // TODO: enable below once hipfftXtSetWorkArea is enabled +#if(0) + ret = hipfftXtSetWorkArea(plan, workareas.data); +#else + throw std::runtime_error( + "cannot request externally-managed work areas with multi-gpu usage"); +#endif + } + else + { + ret = hipfftSetWorkArea(plan, workareas[0]); + } + if(ret == HIPFFT_SUCCESS) + { + // the above "SetWorkArea" frees auto_allocated worksizes (if any) + auto_allocated_worksizes.clear(); + } + return ret; + } + // return true if we need to use hipFFT APIs that separate plan // allocation and plan init bool need_separate_create_make() const { - // scale 
factor and multi-GPU need API calls between create + - // init - if(scale_factor != 1.0 || multiGPU > 1 || mp_lib != fft_mp_lib_none) + // scale factor and multi-GPU and disabled auto-allocation need API + // calls between create + init + if(scale_factor != 1.0 || multiGPU > 1 || mp_lib != fft_mp_lib_none + || auto_allocate == fft_auto_allocation_off) return true; return false; } @@ -1145,7 +1440,14 @@ class hipfft_params : public fft_params // relevant pre-Make APIs (scale factor, XtSetGPUs) hipfftResult_t create_with_pre_make() { - auto ret = hipfftCreate(&plan); + hipfftResult_t ret{HIPFFT_INVALID_PLAN}; + if(auto_allocate == fft_auto_allocation_off) + { + ret = check_worksize_estimate(); // read worksize estimate before plan creation + if(ret != HIPFFT_SUCCESS) + return ret; + } + ret = hipfftCreate(&plan); if(ret != HIPFFT_SUCCESS) return ret; if(scale_factor != 1.0) @@ -1157,7 +1459,8 @@ class hipfft_params : public fft_params if(multiGPU > 1) { int deviceCount = 0; - (void)hipGetDeviceCount(&deviceCount); + if(hipGetDeviceCount(&deviceCount) != hipSuccess) + throw std::runtime_error("hipGetDeviceCount failed"); // ensure that users request less than or equal to the total number of devices if(static_cast(multiGPU) > deviceCount) @@ -1166,9 +1469,8 @@ class hipfft_params : public fft_params std::vector GPUs(multiGPU); std::iota(GPUs.begin(), GPUs.end(), 0); ret = hipfftXtSetGPUs(plan, static_cast(multiGPU), GPUs.data()); - - xt_worksize.resize(GPUs.size()); - workbuffersize_ptr = xt_worksize.data(); + if(ret != HIPFFT_SUCCESS) + return ret; } if(mp_lib == fft_mp_lib_mpi) { @@ -1221,10 +1523,23 @@ class hipfft_params : public fft_params input_stride.data(), output_stride.data()); } + if(ret != HIPFFT_SUCCESS) + return ret; #else throw std::runtime_error("MPI is not enabled"); #endif } + if(auto_allocate == fft_auto_allocation_off) + { + ret = check_worksize_estimate(); // read worksize estimate again after plan creation + if(ret != HIPFFT_SUCCESS) + return ret; 
+ } + if(is_preventing_auto_allocation_at_generation()) + { + ret = hipfftSetAutoAllocation(plan, 0); + } + return ret; } @@ -1233,22 +1548,25 @@ class hipfft_params : public fft_params auto ret = create_with_pre_make(); if(ret != HIPFFT_SUCCESS) return ret; - + // do not register plan's worksizes as "auto-allocated" if auto-allocation was explicitly prevented + size_t* worksize_ptr = is_preventing_auto_allocation_at_generation() + ? nullptr + : auto_allocated_worksizes.data(); switch(dim()) { case 1: return hipfftMakePlan1d( - plan, int_length[0], *hipfft_transform_type, nbatch, workbuffersize_ptr); + plan, int_length[0], *hipfft_transform_type, nbatch, worksize_ptr); case 2: return hipfftMakePlan2d( - plan, int_length[0], int_length[1], *hipfft_transform_type, workbuffersize_ptr); + plan, int_length[0], int_length[1], *hipfft_transform_type, worksize_ptr); case 3: return hipfftMakePlan3d(plan, int_length[0], int_length[1], int_length[2], *hipfft_transform_type, - workbuffersize_ptr); + worksize_ptr); default: throw std::runtime_error("invalid dim"); } @@ -1259,7 +1577,11 @@ class hipfft_params : public fft_params auto ret = create_with_pre_make(); if(ret != HIPFFT_SUCCESS) return ret; - auto layout_args = make_valid_layout_args_for_plan_many(); + // do not register plan's worksizes as "auto-allocated" if auto-allocation was explicitly prevented + size_t* worksize_ptr = is_preventing_auto_allocation_at_generation() + ? 
nullptr + : auto_allocated_worksizes.data(); + auto layout_args = make_valid_layout_args_for_plan_many(); return hipfftMakePlanMany(plan, dim(), int_length.data(), @@ -1271,7 +1593,7 @@ class hipfft_params : public fft_params layout_args.output_distance, *hipfft_transform_type, nbatch, - workbuffersize_ptr); + worksize_ptr); } hipfftResult_t create_make_plan_many64() @@ -1279,7 +1601,12 @@ class hipfft_params : public fft_params auto ret = create_with_pre_make(); if(ret != HIPFFT_SUCCESS) return ret; - auto layout_args = make_valid_layout_args_for_plan_many(); + + // do not register plan's worksizes as "auto-allocated" if auto-allocation was explicitly prevented + size_t* worksize_ptr = is_preventing_auto_allocation_at_generation() + ? nullptr + : auto_allocated_worksizes.data(); + auto layout_args = make_valid_layout_args_for_plan_many(); return hipfftMakePlanMany64(plan, dim(), ll_length.data(), @@ -1291,33 +1618,42 @@ class hipfft_params : public fft_params layout_args.output_distance, *hipfft_transform_type, nbatch, - workbuffersize_ptr); + worksize_ptr); } - hipfftResult_t create_xt_make_plan_many() + hipDataType get_xt_api_execution_type() const { - auto ret = create_with_pre_make(); - if(ret != HIPFFT_SUCCESS) - return ret; - // execution type is always complex, matching the precision // of the transform // Initializing as double by default - hipDataType executionType = HIP_C_64F; + hipDataType ret = HIP_C_64F; switch(precision) { case fft_precision_half: - executionType = HIP_C_16F; + ret = HIP_C_16F; break; case fft_precision_single: - executionType = HIP_C_32F; + ret = HIP_C_32F; break; case fft_precision_double: - executionType = HIP_C_64F; + ret = HIP_C_64F; break; } + return ret; + } + + hipfftResult_t create_xt_make_plan_many() + { + auto ret = create_with_pre_make(); + if(ret != HIPFFT_SUCCESS) + return ret; - auto layout_args = make_valid_layout_args_for_plan_many(); + // do not register plan's worksizes as "auto-allocated" if auto-allocation was 
explicitly prevented + size_t* worksize_ptr = is_preventing_auto_allocation_at_generation() + ? nullptr + : auto_allocated_worksizes.data(); + auto executionType = get_xt_api_execution_type(); + auto layout_args = make_valid_layout_args_for_plan_many(); return hipfftXtMakePlanMany(plan, dim(), ll_length.data(), @@ -1330,9 +1666,16 @@ class hipfft_params : public fft_params layout_args.output_distance, outputType, nbatch, - workbuffersize_ptr, + worksize_ptr, executionType); } + static constexpr size_t absurd_init_worksize_estimate = std::numeric_limits::max(); + bool final_attempt_at_plan_creation = false; + + size_t get_num_used_gpus() const + { + return multiGPU > 1 ? multiGPU : 1; + }; }; #endif diff --git a/clients/tests/accuracy_test_1D.cpp b/clients/tests/accuracy_test_1D.cpp index 5593ec4d..b2aa382a 100644 --- a/clients/tests/accuracy_test_1D.cpp +++ b/clients/tests/accuracy_test_1D.cpp @@ -81,14 +81,8 @@ static std::vector small_1D_sizes() static const size_t SMALL_1D_MAX = 8192; // generate a list of sizes from 2 and up, skipping any sizes that are already covered - std::vector covered_sizes; - std::copy(pow2_range.begin(), pow2_range.end(), std::back_inserter(covered_sizes)); - std::copy(pow3_range.begin(), pow3_range.end(), std::back_inserter(covered_sizes)); - std::copy(pow5_range.begin(), pow5_range.end(), std::back_inserter(covered_sizes)); - std::copy(radX_range.begin(), radX_range.end(), std::back_inserter(covered_sizes)); - std::copy(mix_range.begin(), mix_range.end(), std::back_inserter(covered_sizes)); - std::copy(prime_range.begin(), prime_range.end(), std::back_inserter(covered_sizes)); - std::sort(covered_sizes.begin(), covered_sizes.end()); + std::vector covered_sizes = merge_and_sort_values( + {pow2_range, pow3_range, pow5_range, radX_range, mix_range, prime_range}); std::vector output; for(size_t i = 2; i < SMALL_1D_MAX; ++i) @@ -319,12 +313,14 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_1D, // small 1D sizes just need to make sure our 
factorization isn't // completely broken, so we just check simple C2C outplace interleaved +const static std::vector small_1D_lengths = small_1D_sizes(); + INSTANTIATE_TEST_SUITE_P( small_1D, accuracy_test, ::testing::ValuesIn(param_generator_base(test_prob, {fft_transform_type_complex_forward}, - generate_lengths({small_1D_sizes()}), + generate_lengths({small_1D_lengths}), {fft_precision_single}, {1}, generate_types, @@ -530,3 +526,24 @@ INSTANTIATE_TEST_SUITE_P( ooffset_range_zero, place_range)), accuracy_test::TestName); + +const static std::vector lengths_for_disabled_autoalloc = merge_and_sort_values( + {pow2_range, pow3_range, pow5_range, radX_range, mix_range, small_1D_lengths, prime_range}, + 128); + +INSTANTIATE_TEST_SUITE_P( + various_1D, + accuracy_test, + ::testing::ValuesIn(param_generator(test_prob, + generate_lengths({lengths_for_disabled_autoalloc}), + precision_range_sp_dp, + batch_range_1D, + stride_range, + stride_range, + ioffset_range_zero, + ooffset_range_zero, + place_range, + false, + false, + fft_auto_allocation_off)), + accuracy_test::TestName); diff --git a/clients/tests/accuracy_test_2D.cpp b/clients/tests/accuracy_test_2D.cpp index 5f1b8a98..6f45abe4 100644 --- a/clients/tests/accuracy_test_2D.cpp +++ b/clients/tests/accuracy_test_2D.cpp @@ -278,3 +278,24 @@ INSTANTIATE_TEST_SUITE_P(len1_swap_2D, false, false)), accuracy_test::TestName); + +const static std::vector lengths_for_disabled_autoalloc + = merge_and_sort_values({pow2_range, pow3_range, prime_range, mix_range}, 12); + +INSTANTIATE_TEST_SUITE_P( + various_2D, + accuracy_test, + ::testing::ValuesIn(param_generator(test_prob, + generate_lengths({lengths_for_disabled_autoalloc, + lengths_for_disabled_autoalloc}), + precision_range_sp_dp, + batch_range, + stride_range, + stride_range, + ioffset_range_zero, + ooffset_range_zero, + place_range, + false, + false, + fft_auto_allocation_off)), + accuracy_test::TestName); diff --git a/clients/tests/accuracy_test_3D.cpp 
b/clients/tests/accuracy_test_3D.cpp index f3386950..f3f780de 100644 --- a/clients/tests/accuracy_test_3D.cpp +++ b/clients/tests/accuracy_test_3D.cpp @@ -284,3 +284,25 @@ INSTANTIATE_TEST_SUITE_P( false, false)), accuracy_test::TestName); + +const static std::vector lengths_for_disabled_autoalloc = merge_and_sort_values( + {pow2_range, pow3_range, pow5_range, prime_range, sbrc_range}, 5); + +INSTANTIATE_TEST_SUITE_P( + various_3D, + accuracy_test, + ::testing::ValuesIn(param_generator(test_prob, + generate_lengths({lengths_for_disabled_autoalloc, + lengths_for_disabled_autoalloc, + lengths_for_disabled_autoalloc}), + precision_range_sp_dp, + batch_range, + stride_range, + stride_range, + ioffset_range_zero, + ooffset_range_zero, + place_range, + false, + false, + fft_auto_allocation_off)), + accuracy_test::TestName); diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp index e41718d4..1366ad3c 100644 --- a/clients/tests/gtest_main.cpp +++ b/clients/tests/gtest_main.cpp @@ -43,6 +43,9 @@ #include "hipfft_accuracy_test.h" #include "hipfft_test_params.h" +// initialize static class member of hipfft_params +std::vector hipfft_params::externally_managed_workareas = std::vector(); + // Control output verbosity: int verbose; @@ -340,6 +343,11 @@ int main(int argc, char* argv[]) non_token->add_flag("--callback", "Inject load/store callbacks")->each([&](const std::string&) { manual_params.run_callbacks = true; }); + non_token + ->add_option("--auto_allocation", + manual_params.auto_allocate, + "Backend library's auto-allocation behavior: \"on\", \"off\", or \"default\"") + ->default_val("default"); non_token ->add_flag("--double", "Double precision transform (deprecated: use --precision double)") ->each([&](const std::string&) { manual_params.precision = fft_precision_double; }); diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp index 661d5791..4dc9500c 100644 --- a/clients/tests/multi_device_test.cpp +++ 
b/clients/tests/multi_device_test.cpp @@ -53,7 +53,9 @@ enum SplitType PENCIL_3D, }; -std::vector param_generator_multi_gpu(const std::optional type) +std::vector param_generator_multi_gpu(const std::optional type, + fft_auto_allocation auto_alloc_setting + = fft_auto_allocation_default) { int localDeviceCount = 0; (void)hipGetDeviceCount(&localDeviceCount); @@ -80,7 +82,9 @@ std::vector param_generator_multi_gpu(const std::optional ioffset_range_zero, ooffset_range_zero, place_range, - false); + false, + false, + auto_alloc_setting); auto params_real = param_generator_real(test_prob, multi_gpu_sizes, @@ -91,7 +95,9 @@ std::vector param_generator_multi_gpu(const std::optional ioffset_range_zero, ooffset_range_zero, {fft_placement_notinplace}, - false); + false, + false, + auto_alloc_setting); std::vector all_params; @@ -229,3 +235,9 @@ INSTANTIATE_TEST_SUITE_P(multi_gpu, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu({})), accuracy_test::TestName); + +INSTANTIATE_TEST_SUITE_P(DISABLED_multi_gpu, + accuracy_test, + ::testing::ValuesIn(param_generator_multi_gpu({}, + fft_auto_allocation_off)), + accuracy_test::TestName); diff --git a/library/src/amd_detail/hipfft.cpp b/library/src/amd_detail/hipfft.cpp index f4413600..4096550c 100644 --- a/library/src/amd_detail/hipfft.cpp +++ b/library/src/amd_detail/hipfft.cpp @@ -35,15 +35,6 @@ #include "../../../shared/ptrdiff.h" #include "../../../shared/rocfft_hip.h" -#define ROC_FFT_CHECK_ALLOC_FAILED(ret) \ - { \ - auto code = ret; \ - if(code != rocfft_status_success) \ - { \ - return HIPFFT_ALLOC_FAILED; \ - } \ - } - #define ROC_FFT_CHECK_INVALID_VALUE(ret) \ { \ auto code = ret; \ @@ -410,31 +401,6 @@ catch(...) 
return handle_exception(); } -hipfftResult hipfftPlanMany64(hipfftHandle* plan, - int rank, - long long int* n, - long long int* inembed, - long long int istride, - long long int idist, - long long int* onembed, - long long int ostride, - long long int odist, - hipfftType type, - long long int batch) -try -{ - hipfftHandle handle = nullptr; - HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&handle)); - *plan = handle; - - return hipfftMakePlanMany64( - *plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch, nullptr); -} -catch(...) -{ - return handle_exception(); -} - hipfftResult hipfftMakePlan_internal(hipfftHandle plan, size_t dim, size_t* lengths, @@ -1262,6 +1228,7 @@ try hipfftHandle p; HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p)); + p->autoAllocate = false; HIP_FFT_CHECK_AND_RETURN(hipfftMakePlan1d(p, nx, type, batch, workSize)); HIP_FFT_CHECK_AND_RETURN(hipfftDestroy(p)); @@ -1282,6 +1249,7 @@ try hipfftHandle p; HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p)); + p->autoAllocate = false; HIP_FFT_CHECK_AND_RETURN(hipfftMakePlan2d(p, nx, ny, type, workSize)); HIP_FFT_CHECK_AND_RETURN(hipfftDestroy(p)); @@ -1303,6 +1271,7 @@ try hipfftHandle p; HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p)); + p->autoAllocate = false; HIP_FFT_CHECK_AND_RETURN(hipfftMakePlan3d(p, nx, ny, nz, type, workSize)); HIP_FFT_CHECK_AND_RETURN(hipfftDestroy(p)); @@ -1327,10 +1296,13 @@ hipfftResult hipfftGetSizeMany(hipfftHandle plan, size_t* workSize) try { - hipfftHandle p; - HIP_FFT_CHECK_AND_RETURN( - hipfftPlanMany(&p, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch)); - *workSize = p->workBufferSize; + if(workSize == nullptr) + return HIPFFT_INVALID_VALUE; + hipfftHandle p = nullptr; + HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p)); + p->autoAllocate = false; + HIP_FFT_CHECK_AND_RETURN(hipfftMakePlanMany( + p, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch, workSize)); HIP_FFT_CHECK_AND_RETURN(hipfftDestroy(p)); return HIPFFT_SUCCESS; @@ 
-1354,10 +1326,13 @@ hipfftResult hipfftGetSizeMany64(hipfftHandle plan, size_t* workSize) try { + if(workSize == nullptr) + return HIPFFT_INVALID_VALUE; hipfftHandle p = nullptr; - HIP_FFT_CHECK_AND_RETURN(hipfftPlanMany64( - &p, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch)); - *workSize = p->workBufferSize; + HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p)); + p->autoAllocate = false; + HIP_FFT_CHECK_AND_RETURN(hipfftMakePlanMany64( + p, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch, workSize)); HIP_FFT_CHECK_AND_RETURN(hipfftDestroy(p)); return HIPFFT_SUCCESS; @@ -1831,6 +1806,7 @@ try hipfftHandle p; HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p)); + p->autoAllocate = false; HIP_FFT_CHECK_AND_RETURN(hipfftMakePlanMany_internal( p, rank, n, inembed, istride, idist, onembed, ostride, odist, iotype, batch, workSize)); diff --git a/shared/accuracy_test.h b/shared/accuracy_test.h index d3ea45f0..3194ec58 100644 --- a/shared/accuracy_test.h +++ b/shared/accuracy_test.h @@ -567,7 +567,7 @@ inline void run_round_trip_inverse(Tparams& params, catch(fft_params::work_buffer_alloc_failure& e) { std::stringstream ss; - ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")"; + ss << "Failed to allocate work buffer (size: " << e.attempted_size << ")"; ++n_hip_failures; if(skip_runtime_fails) { @@ -768,7 +768,7 @@ inline void fft_vs_reference_impl(Tparams& params, bool round_trip) { ++n_hip_failures; std::stringstream ss; - ss << "Work buffer allocation failed with size: " << params.workbuffersize; + ss << "Work buffer allocation failed with size: " << e.attempted_size; if(skip_runtime_fails) { throw ROCFFT_SKIP{ss.str()}; diff --git a/shared/fft_params.h b/shared/fft_params.h index 4ee22e70..e3359b4d 100644 --- a/shared/fft_params.h +++ b/shared/fft_params.h @@ -74,7 +74,7 @@ enum fft_precision fft_precision_double, }; -// Used for CLI11 parsing of input gen enum +// Used for CLI11 parsing of precision enum 
static bool lexical_cast(const std::string& word, fft_precision& precision) { if(word == "half") @@ -88,6 +88,28 @@ static bool lexical_cast(const std::string& word, fft_precision& precision) return true; } +enum fft_auto_allocation +{ + fft_auto_allocation_on, + fft_auto_allocation_off, + fft_auto_allocation_default +}; + +// Used for CLI11 parsing of auto-allocation enum +static bool lexical_cast(const std::string& word, fft_auto_allocation& auto_allocation) +{ + if(word == "on") + auto_allocation = fft_auto_allocation_on; + else if(word == "off") + auto_allocation = fft_auto_allocation_off; + else if(word == "default") + auto_allocation = fft_auto_allocation_default; + else + throw std::runtime_error( + "Invalid auto-allocation behavior specified (choose \"on\", \"off\", or \"default\")"); + return true; +} + // fft_input_generator: linearly spaced sequence in [-0.5,0.5] // fft_input_random_generator: pseudo-random sequence in [-0.5,0.5] enum fft_input_generator @@ -491,7 +513,7 @@ class fft_params fft_input_generator igen = fft_input_random_generator_host; #endif - size_t workbuffersize = 0; + fft_auto_allocation auto_allocate = fft_auto_allocation_default; enum fft_mp_lib { @@ -1038,6 +1060,12 @@ class fft_params ret += std::to_string(multiGPU); } + if(auto_allocate != fft_auto_allocation_default) + { + ret += "_autoallocation_"; + ret += (auto_allocate == fft_auto_allocation_on ? "on" : "off"); + } + return ret; } @@ -1197,6 +1225,13 @@ class fft_params ++pos; multiGPU = std::stoull(vals[pos++]); } + + auto_allocate = fft_auto_allocation_default; // default if unspecified + if(pos < vals.size() && vals[pos] == "autoallocation") + { + ++pos; + lexical_cast(vals[pos++], auto_allocate); + } } // Stream output operator (for gtest, etc). @@ -2246,8 +2281,10 @@ class fft_params // Tests that hit this can't fit on the GPU and should be skipped. 
struct work_buffer_alloc_failure : public std::runtime_error { - work_buffer_alloc_failure(const std::string& s) + const size_t attempted_size; + work_buffer_alloc_failure(const std::string& s, size_t _attempted_size = 0) : std::runtime_error(s) + , attempted_size(_attempted_size) { } }; @@ -2272,18 +2309,19 @@ class fft_params throw std::runtime_error("Transform type not forward."); } - length = params_forward.length; - istride = params_forward.ostride; - ostride = params_forward.istride; - nbatch = params_forward.nbatch; - precision = params_forward.precision; - placement = params_forward.placement; - idist = params_forward.odist; - odist = params_forward.idist; - itype = params_forward.otype; - otype = params_forward.itype; - ioffset = params_forward.ooffset; - ooffset = params_forward.ioffset; + length = params_forward.length; + istride = params_forward.ostride; + ostride = params_forward.istride; + nbatch = params_forward.nbatch; + precision = params_forward.precision; + placement = params_forward.placement; + idist = params_forward.odist; + odist = params_forward.idist; + itype = params_forward.otype; + otype = params_forward.itype; + ioffset = params_forward.ooffset; + ooffset = params_forward.ioffset; + auto_allocate = params_forward.auto_allocate; run_callbacks = params_forward.run_callbacks; diff --git a/shared/params_gen.h b/shared/params_gen.h index eb6e77a6..b9644106 100644 --- a/shared/params_gen.h +++ b/shared/params_gen.h @@ -58,6 +58,27 @@ inline double hash_prob(const int seed, const std::string& token) return roll; } +template , bool> = true> +std::vector merge_and_sort_values(const std::vector>& set_of_vecs, + size_t max_num_elem = std::numeric_limits::max()) +{ + std::vector merged; + for(const auto& vec : set_of_vecs) + { + std::copy(vec.begin(), vec.end(), std::back_inserter(merged)); + } + std::sort(merged.begin(), merged.end()); + auto last_unique = std::unique(merged.begin(), merged.end()); + merged.erase(last_unique, merged.end()); + 
std::ranlux24_base gen(random_seed); + while(merged.size() > max_num_elem) + { + // remove pseudo-randomly chosen elements + merged.erase(merged.begin() + (static_cast(gen()) % merged.size())); + } + return merged; +} + // Given a vector of vector of lengths, generate all unique permutations. // Add an optional vector of ad-hoc lengths to the result. inline std::vector> @@ -248,7 +269,8 @@ inline auto param_generator_base(const double base_p const std::vector>& ooffset_range, const std::vector& place_range, const bool planar = true, - const bool run_callbacks = false) + const bool run_callbacks = false, + const fft_auto_allocation auto_alloc = fft_auto_allocation_default) { std::vector params; @@ -300,6 +322,7 @@ inline auto param_generator_base(const double base_p param.otype = std::get<3>(types); param.ioffset = ioffset; param.ooffset = ooffset; + param.auto_allocate = auto_alloc; if(run_callbacks) { @@ -369,8 +392,8 @@ inline auto param_generator(const double base_prob, const std::vector>& ooffset_range, const std::vector& place_range, const bool planar, - - const bool run_callbacks = false) + const bool run_callbacks = false, + const fft_auto_allocation auto_alloc = fft_auto_allocation_default) { return param_generator_base(base_prob, trans_type_range, @@ -383,9 +406,9 @@ inline auto param_generator(const double base_prob, ioffset_range, ooffset_range, place_range, - planar, - run_callbacks); + run_callbacks, + auto_alloc); } // Create an array of parameters to pass to gtest. 
Only tests complex-type transforms @@ -398,9 +421,10 @@ inline auto param_generator_complex(const double bas const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range, - - const bool planar, - const bool run_callbacks = false) + const bool planar, + const bool run_callbacks = false, + const fft_auto_allocation auto_alloc + = fft_auto_allocation_default) { return param_generator_base(base_prob, trans_type_range_complex, @@ -413,9 +437,9 @@ inline auto param_generator_complex(const double bas ioffset_range, ooffset_range, place_range, - planar, - run_callbacks); + run_callbacks, + auto_alloc); } // Create an array of parameters to pass to gtest. @@ -428,9 +452,9 @@ inline auto param_generator_real(const double base_p const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range, - - const bool planar, - const bool run_callbacks = false) + const bool planar, + const bool run_callbacks = false, + const fft_auto_allocation auto_alloc = fft_auto_allocation_default) { return param_generator_base(test_prob, trans_type_range_real, @@ -443,9 +467,9 @@ inline auto param_generator_real(const double base_p ioffset_range, ooffset_range, place_range, - planar, - run_callbacks); + run_callbacks, + auto_alloc); } template diff --git a/shared/rocfft_params.h b/shared/rocfft_params.h index 6f85785a..c84f60a3 100644 --- a/shared/rocfft_params.h +++ b/shared/rocfft_params.h @@ -170,6 +170,7 @@ class rocfft_params_base : public fft_params rocfft_execution_info info = nullptr; rocfft_plan_description desc = nullptr; gpubuf_t wbuffer; + size_t workbuffersize = 0; explicit rocfft_params_base() = default; @@ -410,7 +411,8 @@ class rocfft_params_base : public fft_params { return ret; } - if(workbuffersize > 0) + // default behavior is to feed rocfft with a work area if it needs one + if(workbuffersize > 0 && auto_allocate != fft_auto_allocation_on) { hipError_t hip_status = hipSuccess; hip_status = 
wbuffer.alloc(workbuffersize); @@ -429,7 +431,7 @@ class rocfft_params_base : public fft_params { oss << "hipMemGetInfo also failed"; } - throw work_buffer_alloc_failure(oss.str()); + throw work_buffer_alloc_failure(oss.str(), workbuffersize); } auto rocret From 2eda57c36705f2153020904b2c0329d38dd66a59 Mon Sep 17 00:00:00 2001 From: Raphael Egan Date: Mon, 7 Jul 2025 21:46:24 +0000 Subject: [PATCH 2/5] Remove dependency on external random_seed and repair build for hipfft-bench --- clients/bench/bench.cpp | 3 +++ clients/hipfft_params.h | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp index c9cfed9e..f9b79272 100644 --- a/clients/bench/bench.cpp +++ b/clients/bench/bench.cpp @@ -31,6 +31,9 @@ #include "../../shared/client_except.h" #include "../../shared/gpubuf.h" +// initialize static class member of hipfft_params +std::vector hipfft_params::externally_managed_workareas = std::vector(); + int main(int argc, char* argv[]) { // This helps with mixing output of both wide and narrow characters to the screen diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h index c707f1ac..248cd45c 100644 --- a/clients/hipfft_params.h +++ b/clients/hipfft_params.h @@ -30,7 +30,6 @@ #include "../shared/concurrency.h" #include "../shared/fft_params.h" #include "../shared/hipfft_brick.h" -#include "../shared/test_params.h" #include "hipfft/hipfft.h" #include "hipfft/hipfftXt.h" #include @@ -46,7 +45,7 @@ template hasher; - std::ranlux24_base gen(random_seed + hasher(token)); + std::ranlux24_base gen(hasher(token)); std::uniform_int_distribution dis(static_cast(0), std::numeric_limits::max()); val = dis(gen); ((args = dis(gen)), ...); @@ -211,7 +210,7 @@ class hipfft_params : public fft_params // externally-managed workarea(s) are provided after plan generation // Note: this member function must return the same result even if called // more than once by a given instance, it must be stable for any instance 
- return (random_seed + std::hash()(token())) % 2 == 1; + return std::hash()(token()) % 2 == 1; } hipfft_params() = default; From f827ad67885fbf24e9853c73829212eb4333a3f4 Mon Sep 17 00:00:00 2001 From: Raphael Egan Date: Tue, 8 Jul 2025 16:53:53 +0000 Subject: [PATCH 3/5] Adding 'auto_allocation' option to hip-bench target and clarifying description thereof --- clients/bench/bench.cpp | 5 +++++ clients/tests/gtest_main.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp index f9b79272..2bc4a3e3 100644 --- a/clients/bench/bench.cpp +++ b/clients/bench/bench.cpp @@ -81,6 +81,11 @@ int main(int argc, char* argv[]) "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " "forward\n3) real inverse") ->default_val(fft_transform_type_complex_forward); + non_token + ->add_option("--auto_allocation", + params.auto_allocate, + "HipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"") + ->default_val("default"); non_token ->add_option( "--precision", params.precision, "Transform precision: single (default), double, half") diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp index 1366ad3c..7aaab7a3 100644 --- a/clients/tests/gtest_main.cpp +++ b/clients/tests/gtest_main.cpp @@ -346,7 +346,7 @@ int main(int argc, char* argv[]) non_token ->add_option("--auto_allocation", manual_params.auto_allocate, - "Backend library's auto-allocation behavior: \"on\", \"off\", or \"default\"") + "HipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"") ->default_val("default"); non_token ->add_flag("--double", "Double precision transform (deprecated: use --precision double)") From 517f10110ec59fe381bb28662be9803dc42c4a97 Mon Sep 17 00:00:00 2001 From: Raphael Egan Date: Wed, 9 Jul 2025 22:56:34 +0000 Subject: [PATCH 4/5] Using ad-hoc exception type for unimplemented cases and avoiding silent failure for get_xt_api_execution_type in case of unexpected precision --- 
clients/hipfft_params.h | 13 ++++--------- clients/tests/hipfft_accuracy_test.cpp | 4 ++++ clients/tests/multi_device_test.cpp | 5 +++-- shared/fft_params.h | 9 +++++++++ 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h index 248cd45c..7c76fe06 100644 --- a/clients/hipfft_params.h +++ b/clients/hipfft_params.h @@ -488,13 +488,6 @@ class hipfft_params : public fft_params void validate_fields() const override { - if(multiGPU > 1 && auto_allocate == fft_auto_allocation_off) - { - // hipfftXtSetWorkArea would be required - throw std::runtime_error( - "cannot request externally-managed work areas with multi-gpu usage"); - } - validate_brick_volume(); // multi-process only works with batch-1 FFTs, as hipFFT has @@ -1282,8 +1275,8 @@ class hipfft_params : public fft_params #if(0) ret = hipfftXtSetWorkArea(plan, workareas.data); #else - throw std::runtime_error( - "cannot request externally-managed work areas with multi-gpu usage"); + throw unimplemented_exception( + "No implementation support for externally-managed work areas with multi-gpu usage"); #endif } else @@ -1637,6 +1630,8 @@ class hipfft_params : public fft_params case fft_precision_double: ret = HIP_C_64F; break; + default: + throw std::runtime_error("Invalid precision"); } return ret; } diff --git a/clients/tests/hipfft_accuracy_test.cpp b/clients/tests/hipfft_accuracy_test.cpp index 25c809c6..c537251d 100644 --- a/clients/tests/hipfft_accuracy_test.cpp +++ b/clients/tests/hipfft_accuracy_test.cpp @@ -95,6 +95,10 @@ TEST_P(accuracy_test, vs_fftw) { GTEST_SKIP() << e.msg; } + catch(const fft_params::unimplemented_exception& e) + { + GTEST_SKIP() << "Unimplemented exception: " << e.what(); + } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp index 4dc9500c..3ef05507 100644 --- a/clients/tests/multi_device_test.cpp +++ b/clients/tests/multi_device_test.cpp @@ -189,7 
+189,6 @@ std::vector param_generator_multi_gpu(const std::optional // in-place transforms require identical input/output layouts if(p.placement == fft_placement_inplace && input_grid != output_grid) continue; - all_params.push_back(std::move(p_dist)); } } @@ -236,7 +235,9 @@ INSTANTIATE_TEST_SUITE_P(multi_gpu, ::testing::ValuesIn(param_generator_multi_gpu({})), accuracy_test::TestName); -INSTANTIATE_TEST_SUITE_P(DISABLED_multi_gpu, +// Note: disabled for now due to implementation issues and +// unimplemented features in hipFFT (to fix first) +INSTANTIATE_TEST_SUITE_P(DISABLED_various_multi_gpu, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu({}, fft_auto_allocation_off)), diff --git a/shared/fft_params.h b/shared/fft_params.h index e3359b4d..3abe34b4 100644 --- a/shared/fft_params.h +++ b/shared/fft_params.h @@ -2289,6 +2289,15 @@ class fft_params } }; + // Specific exception type for unimplemented feature(s). + struct unimplemented_exception : public std::runtime_error + { + unimplemented_exception(const std::string& s) + : std::runtime_error(s) + { + } + }; + virtual fft_status create_plan() { return fft_status_success; From bc6708a5bfa39bd1b64ad198e7714ab239c976b1 Mon Sep 17 00:00:00 2001 From: Raphael Egan Date: Thu, 17 Jul 2025 15:50:38 +0000 Subject: [PATCH 5/5] 'HipFFT' -> 'hipFFT' --- clients/bench/bench.cpp | 2 +- clients/tests/gtest_main.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp index 2bc4a3e3..52cf8883 100644 --- a/clients/bench/bench.cpp +++ b/clients/bench/bench.cpp @@ -84,7 +84,7 @@ int main(int argc, char* argv[]) non_token ->add_option("--auto_allocation", params.auto_allocate, - "HipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"") + "hipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"") ->default_val("default"); non_token ->add_option( diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp index 
7aaab7a3..e12dbdc3 100644 --- a/clients/tests/gtest_main.cpp +++ b/clients/tests/gtest_main.cpp @@ -346,7 +346,7 @@ int main(int argc, char* argv[]) non_token ->add_option("--auto_allocation", manual_params.auto_allocate, - "HipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"") + "hipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"") ->default_val("default"); non_token ->add_flag("--double", "Double precision transform (deprecated: use --precision double)")