From 675fe00a3f07b2a51fdb94b892777556b655ab52 Mon Sep 17 00:00:00 2001
From: Raphael Egan <Raphael.Egan@amd.com>
Date: Wed, 25 Jun 2025 21:54:37 +0000
Subject: [PATCH 1/5] Enable (and test for) externally-managed workareas via
 hipfft_params

---
 clients/hipfft_params.h             | 419 +++++++++++++++++++++++++---
 clients/tests/accuracy_test_1D.cpp  |  35 ++-
 clients/tests/accuracy_test_2D.cpp  |  21 ++
 clients/tests/accuracy_test_3D.cpp  |  22 ++
 clients/tests/gtest_main.cpp        |   8 +
 clients/tests/multi_device_test.cpp |  18 +-
 library/src/amd_detail/hipfft.cpp   |  58 ++--
 shared/accuracy_test.h              |   4 +-
 shared/fft_params.h                 |  68 ++++-
 shared/params_gen.h                 |  54 +++-
 shared/rocfft_params.h              |   6 +-
 11 files changed, 588 insertions(+), 125 deletions(-)
diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h
index 175f5f20..c707f1ac 100644
--- a/clients/hipfft_params.h
+++ b/clients/hipfft_params.h
@@ -182,21 +182,43 @@ class hipfft_params : public fft_params
 
     // backend library can write N worksize values for N GPUs, so
     // allocate a vector for that if necessary
-    std::vector<size_t> xt_worksize;
+    std::vector<size_t> auto_allocated_worksizes;
+    // if auto_allocate == fft_auto_allocation_off, the hipFFT plan(s)
+    // will be provided with externally-managed work area(s):
+    static std::vector<gpubuf> externally_managed_workareas;
 
-    // pointer we pass to the backend library.  By default point to the
-    // single-GPU workbuffer size.
-    size_t* workbuffersize_ptr;
+    // sum of the work-area sizes currently recorded in auto_allocated_worksizes
+    size_t auto_allocated_extra_vram_footprint() const
+    {
+        const auto& sizes = auto_allocated_worksizes;
+        return std::accumulate(sizes.cbegin(), sizes.cend(), size_t{0});
+    }
 
-    hipfft_params()
+    static size_t externally_managed_extra_vram_footprint()
     {
-        workbuffersize_ptr = &workbuffersize;
+        return std::accumulate(externally_managed_workareas.begin(),
+                               externally_managed_workareas.end(),
+                               static_cast<size_t>(0),
+                               [](size_t total, const gpubuf& buf) { return total + buf.size(); });
     }
 
+    // Tells whether hipFFT's auto-allocation is turned off before plan generation;
+    // always false unless auto_allocate == fft_auto_allocation_off. Even then,
+    // hipFFT is sometimes left to auto-allocate so that tests cover its ability to
+    // free resources (allocated at generation) when/if externally-managed
+    // workarea(s) are provided afterwards. The result must be stable across calls
+    // on a given instance: it only uses auto_allocate, random_seed and token().
+    bool is_preventing_auto_allocation_at_generation() const
+    {
+        return auto_allocate == fft_auto_allocation_off
+               && (random_seed + std::hash<std::string>()(token())) % 2 == 1;
+    }
+
+    hipfft_params() = default;
+
     hipfft_params(const fft_params& p)
         : fft_params(p)
     {
-        workbuffersize_ptr = &workbuffersize;
     }
 
     ~hipfft_params()
@@ -228,12 +250,14 @@ class hipfft_params : public fft_params
         }
         catch(fft_params::work_buffer_alloc_failure& e)
         {
-            val += workbuffersize;
+            val += auto_allocated_extra_vram_footprint();
+            val += externally_managed_extra_vram_footprint();
             std::stringstream msg;
             msg << "Plan work buffer size (" << val << " bytes raw data) too large for device";
             throw ROCFFT_SKIP{msg.str()};
         }
-        val += workbuffersize;
+        val += auto_allocated_extra_vram_footprint();
+        val += externally_managed_extra_vram_footprint();
         return val;
     }
 
@@ -358,6 +382,11 @@ class hipfft_params : public fft_params
             int_inembed[i] = ll_inembed[i];
             int_onembed[i] = ll_onembed[i];
         }
+        // reset auto_allocated_worksizes
+        auto_allocated_worksizes.resize(get_num_used_gpus());
+        std::for_each(auto_allocated_worksizes.begin(),
+                      auto_allocated_worksizes.end(),
+                      [](decltype(auto_allocated_worksizes)::value_type& val) { val = 0; });
 
         hipfftResult ret = HIPFFT_SUCCESS;
         return fft_status_from_hipfftparams(ret);
@@ -424,13 +453,34 @@ class hipfft_params : public fft_params
         }
         }
 
+        if(ret == HIPFFT_SUCCESS && auto_allocate == fft_auto_allocation_off)
+        {
+            ret = set_externally_managed_work_areas();
+        }
+
         // hipFFT can fail plan creation due to allocation failure -
         // tests are expecting a specific exception in that case,
         // because the test was unable to run.  Doesn't mean the test
         // case failed.
         if(ret == HIPFFT_ALLOC_FAILED)
-            throw fft_params::work_buffer_alloc_failure(
-                "plan create failed due to allocation failure");
+        {
+            if(!final_attempt_at_plan_creation && externally_managed_extra_vram_footprint() > 0)
+            {
+                final_attempt_at_plan_creation = true;
+                // device allocation(s) in externally_managed_workareas might be
+                // larger than needed or even unnecessary for the instance of interest.
+                // Free them up and try again before concluding.
+                externally_managed_workareas.clear();
+                return create_plan();
+            }
+            else
+            {
+                throw fft_params::work_buffer_alloc_failure(
+                    "plan create failed due to allocation failure",
+                    externally_managed_extra_vram_footprint()
+                        + auto_allocated_extra_vram_footprint());
+            }
+        }
 
         // store token to check if plan was already made
         current_token = token();
@@ -439,6 +489,13 @@ class hipfft_params : public fft_params
 
     void validate_fields() const override
     {
+        if(multiGPU > 1 && auto_allocate == fft_auto_allocation_off)
+        {
+            // hipfftXtSetWorkArea would be required
+            throw std::runtime_error(
+                "cannot request externally-managed work areas with multi-gpu usage");
+        }
+
         validate_brick_volume();
 
         // multi-process only works with batch-1 FFTs, as hipFFT has
@@ -1005,13 +1062,251 @@ class hipfft_params : public fft_params
         CREATE_XT_MAKE_PLAN_MANY,
     };
 
+    // Check that worksize estimates can be successfully queried with or without a valid
+    // plan: hipfftEstimate* (if any) when plan == INVALID_PLAN_HANDLE, hipfft(Xt)GetSize* if not.
+    // Returns the status of the query, or HIPFFT_INTERNAL_ERROR if the create type is not
+    // supported or if hipFFT left a checked estimate at its initial (sentinel) value.
+    // Throws std::runtime_error if hipfft_transform_type is not set or if dim() is invalid.
+    hipfftResult_t check_worksize_estimate()
+    {
+        hipfftResult_t ret{HIPFFT_INTERNAL_ERROR};
+        if(!hipfft_transform_type)
+        {
+            throw std::runtime_error("Estimating worksize requires a valid type of transform");
+        }
+        std::vector<size_t> worksize_estimate(get_num_used_gpus(), absurd_init_worksize_estimate);
+        switch(get_create_type())
+        {
+        case CREATE_MAKE_PLAN_Nd:
+        {
+            switch(dim())
+            {
+            case 1:
+                if(plan == INVALID_PLAN_HANDLE)
+                    ret = hipfftEstimate1d(
+                        int_length[0], *hipfft_transform_type, nbatch, worksize_estimate.data());
+                else
+                    ret = hipfftGetSize1d(plan,
+                                          int_length[0],
+                                          *hipfft_transform_type,
+                                          nbatch,
+                                          worksize_estimate.data());
+                break;
+            case 2:
+                if(plan == INVALID_PLAN_HANDLE)
+                    ret = hipfftEstimate2d(int_length[0],
+                                           int_length[1],
+                                           *hipfft_transform_type,
+                                           worksize_estimate.data());
+                else
+                    ret = hipfftGetSize2d(plan,
+                                          int_length[0],
+                                          int_length[1],
+                                          *hipfft_transform_type,
+                                          worksize_estimate.data());
+                break;
+            case 3:
+                if(plan == INVALID_PLAN_HANDLE)
+                    ret = hipfftEstimate3d(int_length[0],
+                                           int_length[1],
+                                           int_length[2],
+                                           *hipfft_transform_type,
+                                           worksize_estimate.data());
+                else
+                    ret = hipfftGetSize3d(plan,
+                                          int_length[0],
+                                          int_length[1],
+                                          int_length[2],
+                                          *hipfft_transform_type,
+                                          worksize_estimate.data());
+                break;
+            default:
+                throw std::runtime_error("invalid dim");
+            }
+            break;
+        }
+        case CREATE_MAKE_PLAN_MANY:
+        {
+            auto layout_args = make_valid_layout_args_for_plan_many<int>();
+            if(plan == INVALID_PLAN_HANDLE)
+                ret = hipfftEstimateMany(dim(),
+                                         int_length.data(),
+                                         layout_args.input_embed,
+                                         layout_args.input_stride,
+                                         layout_args.input_distance,
+                                         layout_args.output_embed,
+                                         layout_args.output_stride,
+                                         layout_args.output_distance,
+                                         *hipfft_transform_type,
+                                         nbatch,
+                                         worksize_estimate.data());
+            else
+                ret = hipfftGetSizeMany(plan,
+                                        dim(),
+                                        int_length.data(),
+                                        layout_args.input_embed,
+                                        layout_args.input_stride,
+                                        layout_args.input_distance,
+                                        layout_args.output_embed,
+                                        layout_args.output_stride,
+                                        layout_args.output_distance,
+                                        *hipfft_transform_type,
+                                        nbatch,
+                                        worksize_estimate.data());
+            break;
+        }
+        case CREATE_MAKE_PLAN_MANY64:
+        {
+            if(plan == INVALID_PLAN_HANDLE)
+            {
+                // no direct equivalent in estimate-fetching APIs: report zero estimate(s)
+                std::fill(worksize_estimate.begin(), worksize_estimate.end(), size_t{0});
+                ret = HIPFFT_SUCCESS;
+            }
+            else
+            {
+                auto layout_args = make_valid_layout_args_for_plan_many<long long>();
+                ret              = hipfftGetSizeMany64(plan,
+                                          dim(),
+                                          ll_length.data(),
+                                          layout_args.input_embed,
+                                          layout_args.input_stride,
+                                          layout_args.input_distance,
+                                          layout_args.output_embed,
+                                          layout_args.output_stride,
+                                          layout_args.output_distance,
+                                          *hipfft_transform_type,
+                                          nbatch,
+                                          worksize_estimate.data());
+            }
+            break;
+        }
+        case CREATE_XT_MAKE_PLAN_MANY:
+        {
+            if(plan == INVALID_PLAN_HANDLE)
+            {
+                // no direct equivalent in estimate-fetching APIs: report zero estimate(s)
+                std::fill(worksize_estimate.begin(), worksize_estimate.end(), size_t{0});
+                ret = HIPFFT_SUCCESS;
+            }
+            else
+            {
+                auto executionType = get_xt_api_execution_type();
+                auto layout_args   = make_valid_layout_args_for_plan_many<long long>();
+                ret                = hipfftXtGetSizeMany(plan,
+                                          dim(),
+                                          ll_length.data(),
+                                          layout_args.input_embed,
+                                          layout_args.input_stride,
+                                          layout_args.input_distance,
+                                          inputType,
+                                          layout_args.output_embed,
+                                          layout_args.output_stride,
+                                          layout_args.output_distance,
+                                          outputType,
+                                          nbatch,
+                                          worksize_estimate.data(),
+                                          executionType);
+            }
+            break;
+        }
+        case PLAN_Nd:
+        case PLAN_MANY:
+        default:
+        {
+            // should be indirectly disabled via get_create_type()
+            return HIPFFT_INTERNAL_ERROR;
+        }
+        }
+        // check that the value(s) of worksize_estimate were actually set, assuming that
+        // setting a worksize_estimate equal to absurd_init_worksize_estimate by hipFFT
+        // cannot be considered "correct".
+        // Note: worksize_estimate value(s) are *not* guaranteed to be greater than or equal
+        // to the actual value(s) of the work area(s), queriable after plan generation via
+        // hipfftGetSize.
+        if(ret == HIPFFT_SUCCESS)
+        {
+            // the estimate can't have any knowledge about the number of GPUs being used if
+            // the plan wasn't created first
+            const size_t num_values_to_check
+                = plan == INVALID_PLAN_HANDLE ? 1 : worksize_estimate.size();
+            for(size_t idx = 0; ret == HIPFFT_SUCCESS && idx < num_values_to_check; ++idx)
+            {
+                ret = worksize_estimate[idx] != absurd_init_worksize_estimate
+                          ? HIPFFT_SUCCESS
+                          : HIPFFT_INTERNAL_ERROR;
+            }
+        }
+        return ret;
+    }
+
+    // Provide externally-managed work area(s) to a successfully generated plan: the
+    // size(s) required by the plan are queried via hipfftGetSize and buffers of
+    // externally_managed_workareas are reused if large enough, reallocated otherwise.
+    // Returns HIPFFT_ALLOC_FAILED if a (re)allocation fails, HIPFFT_INTERNAL_ERROR if a
+    // size was left unset by hipfftGetSize, the status of the hipFFT calls otherwise.
+    // Throws std::runtime_error for multi-GPU usage (requires hipfftXtSetWorkArea).
+    hipfftResult_t set_externally_managed_work_areas()
+    {
+        const size_t        num_gpus = get_num_used_gpus();
+        std::vector<size_t> req_workarea_sizes(num_gpus, absurd_init_worksize_estimate);
+        hipfftResult_t      ret = hipfftGetSize(plan, req_workarea_sizes.data());
+        if(ret != HIPFFT_SUCCESS)
+            return ret;
+        if(std::any_of(req_workarea_sizes.begin(),
+                       req_workarea_sizes.end(),
+                       [](const decltype(req_workarea_sizes)::value_type& val) {
+                           return val == absurd_init_worksize_estimate;
+                       }))
+            return HIPFFT_INTERNAL_ERROR;
+        // req_workarea_sizes are known and validated: check whether the current
+        // externally_managed_workareas can be used as is or not
+        if(externally_managed_workareas.size() < num_gpus)
+            externally_managed_workareas.resize(num_gpus);
+        std::vector<void*> workareas(num_gpus, nullptr);
+        for(size_t workarea_idx = 0; workarea_idx < num_gpus; ++workarea_idx)
+        {
+            const auto req_size = req_workarea_sizes[workarea_idx];
+            auto&      buf      = externally_managed_workareas[workarea_idx];
+            if(buf.size() < req_size)
+            {
+                // too small, free and reallocate to meet current needs
+                buf.free();
+                if(buf.alloc(req_size) != hipSuccess)
+                    return HIPFFT_ALLOC_FAILED;
+            }
+            workareas[workarea_idx] = buf.data();
+        }
+        if(num_gpus > 1)
+        {
+            // TODO: enable below once hipfftXtSetWorkArea is enabled
+#if(0)
+            ret = hipfftXtSetWorkArea(plan, workareas.data());
+#else
+            throw std::runtime_error(
+                "cannot request externally-managed work areas with multi-gpu usage");
+#endif
+        }
+        else
+        {
+            ret = hipfftSetWorkArea(plan, workareas[0]);
+        }
+        if(ret == HIPFFT_SUCCESS)
+        {
+            // the above "SetWorkArea" frees auto_allocated worksizes (if any)
+            auto_allocated_worksizes.clear();
+        }
+        return ret;
+    }
+
     // return true if we need to use hipFFT APIs that separate plan
     // allocation and plan init
     bool need_separate_create_make() const
     {
-        // scale factor and multi-GPU need API calls between create +
-        // init
-        if(scale_factor != 1.0 || multiGPU > 1 || mp_lib != fft_mp_lib_none)
+        // scale factor and multi-GPU and disabled auto-allocation need API
+        // calls between create + init
+        if(scale_factor != 1.0 || multiGPU > 1 || mp_lib != fft_mp_lib_none
+           || auto_allocate == fft_auto_allocation_off)
             return true;
         return false;
     }
@@ -1145,7 +1440,14 @@ class hipfft_params : public fft_params
     // relevant pre-Make APIs (scale factor, XtSetGPUs)
     hipfftResult_t create_with_pre_make()
     {
-        auto ret = hipfftCreate(&plan);
+        hipfftResult_t ret{HIPFFT_INVALID_PLAN};
+        if(auto_allocate == fft_auto_allocation_off)
+        {
+            ret = check_worksize_estimate(); // read worksize estimate before plan creation
+            if(ret != HIPFFT_SUCCESS)
+                return ret;
+        }
+        ret = hipfftCreate(&plan);
         if(ret != HIPFFT_SUCCESS)
             return ret;
         if(scale_factor != 1.0)
@@ -1157,7 +1459,8 @@ class hipfft_params : public fft_params
         if(multiGPU > 1)
         {
             int deviceCount = 0;
-            (void)hipGetDeviceCount(&deviceCount);
+            if(hipGetDeviceCount(&deviceCount) != hipSuccess)
+                throw std::runtime_error("hipGetDeviceCount failed");
 
             // ensure that users request less than or equal to the total number of devices
             if(static_cast<int>(multiGPU) > deviceCount)
@@ -1166,9 +1469,8 @@ class hipfft_params : public fft_params
             std::vector<int> GPUs(multiGPU);
             std::iota(GPUs.begin(), GPUs.end(), 0);
             ret = hipfftXtSetGPUs(plan, static_cast<int>(multiGPU), GPUs.data());
-
-            xt_worksize.resize(GPUs.size());
-            workbuffersize_ptr = xt_worksize.data();
+            if(ret != HIPFFT_SUCCESS)
+                return ret;
         }
         if(mp_lib == fft_mp_lib_mpi)
         {
@@ -1221,10 +1523,23 @@ class hipfft_params : public fft_params
                                               input_stride.data(),
                                               output_stride.data());
             }
+            if(ret != HIPFFT_SUCCESS)
+                return ret;
 #else
             throw std::runtime_error("MPI is not enabled");
 #endif
         }
+        if(auto_allocate == fft_auto_allocation_off)
+        {
+            ret = check_worksize_estimate(); // read worksize estimate again after plan creation
+            if(ret != HIPFFT_SUCCESS)
+                return ret;
+        }
+        if(is_preventing_auto_allocation_at_generation())
+        {
+            ret = hipfftSetAutoAllocation(plan, 0);
+        }
+
         return ret;
     }
 
@@ -1233,22 +1548,25 @@ class hipfft_params : public fft_params
         auto ret = create_with_pre_make();
         if(ret != HIPFFT_SUCCESS)
             return ret;
-
+        // do not register plan's worksizes as "auto-allocated" if auto-allocation was explicitly prevented
+        size_t* worksize_ptr = is_preventing_auto_allocation_at_generation()
+                                   ? nullptr
+                                   : auto_allocated_worksizes.data();
         switch(dim())
         {
         case 1:
             return hipfftMakePlan1d(
-                plan, int_length[0], *hipfft_transform_type, nbatch, workbuffersize_ptr);
+                plan, int_length[0], *hipfft_transform_type, nbatch, worksize_ptr);
         case 2:
             return hipfftMakePlan2d(
-                plan, int_length[0], int_length[1], *hipfft_transform_type, workbuffersize_ptr);
+                plan, int_length[0], int_length[1], *hipfft_transform_type, worksize_ptr);
         case 3:
             return hipfftMakePlan3d(plan,
                                     int_length[0],
                                     int_length[1],
                                     int_length[2],
                                     *hipfft_transform_type,
-                                    workbuffersize_ptr);
+                                    worksize_ptr);
         default:
             throw std::runtime_error("invalid dim");
         }
@@ -1259,7 +1577,11 @@ class hipfft_params : public fft_params
         auto ret = create_with_pre_make();
         if(ret != HIPFFT_SUCCESS)
             return ret;
-        auto layout_args = make_valid_layout_args_for_plan_many<int>();
+        // do not register plan's worksizes as "auto-allocated" if auto-allocation was explicitly prevented
+        size_t* worksize_ptr = is_preventing_auto_allocation_at_generation()
+                                   ? nullptr
+                                   : auto_allocated_worksizes.data();
+        auto    layout_args  = make_valid_layout_args_for_plan_many<int>();
         return hipfftMakePlanMany(plan,
                                   dim(),
                                   int_length.data(),
@@ -1271,7 +1593,7 @@ class hipfft_params : public fft_params
                                   layout_args.output_distance,
                                   *hipfft_transform_type,
                                   nbatch,
-                                  workbuffersize_ptr);
+                                  worksize_ptr);
     }
 
     hipfftResult_t create_make_plan_many64()
@@ -1279,7 +1601,12 @@ class hipfft_params : public fft_params
         auto ret = create_with_pre_make();
         if(ret != HIPFFT_SUCCESS)
             return ret;
-        auto layout_args = make_valid_layout_args_for_plan_many<long long int>();
+
+        // do not register plan's worksizes as "auto-allocated" if auto-allocation was explicitly prevented
+        size_t* worksize_ptr = is_preventing_auto_allocation_at_generation()
+                                   ? nullptr
+                                   : auto_allocated_worksizes.data();
+        auto    layout_args  = make_valid_layout_args_for_plan_many<long long int>();
         return hipfftMakePlanMany64(plan,
                                     dim(),
                                     ll_length.data(),
@@ -1291,33 +1618,42 @@ class hipfft_params : public fft_params
                                     layout_args.output_distance,
                                     *hipfft_transform_type,
                                     nbatch,
-                                    workbuffersize_ptr);
+                                    worksize_ptr);
     }
 
-    hipfftResult_t create_xt_make_plan_many()
+    hipDataType get_xt_api_execution_type() const
     {
-        auto ret = create_with_pre_make();
-        if(ret != HIPFFT_SUCCESS)
-            return ret;
-
         // execution type is always complex, matching the precision
         // of the transform
         // Initializing as double by default
-        hipDataType executionType = HIP_C_64F;
+        hipDataType ret = HIP_C_64F;
         switch(precision)
         {
         case fft_precision_half:
-            executionType = HIP_C_16F;
+            ret = HIP_C_16F;
             break;
         case fft_precision_single:
-            executionType = HIP_C_32F;
+            ret = HIP_C_32F;
             break;
         case fft_precision_double:
-            executionType = HIP_C_64F;
+            ret = HIP_C_64F;
             break;
         }
+        return ret;
+    }
+
+    hipfftResult_t create_xt_make_plan_many()
+    {
+        auto ret = create_with_pre_make();
+        if(ret != HIPFFT_SUCCESS)
+            return ret;
 
-        auto layout_args = make_valid_layout_args_for_plan_many<long long int>();
+        // do not register plan's worksizes as "auto-allocated" if auto-allocation was explicitly prevented
+        size_t* worksize_ptr  = is_preventing_auto_allocation_at_generation()
+                                    ? nullptr
+                                    : auto_allocated_worksizes.data();
+        auto    executionType = get_xt_api_execution_type();
+        auto    layout_args   = make_valid_layout_args_for_plan_many<long long int>();
         return hipfftXtMakePlanMany(plan,
                                     dim(),
                                     ll_length.data(),
@@ -1330,9 +1666,16 @@ class hipfft_params : public fft_params
                                     layout_args.output_distance,
                                     outputType,
                                     nbatch,
-                                    workbuffersize_ptr,
+                                    worksize_ptr,
                                     executionType);
     }
+    static constexpr size_t absurd_init_worksize_estimate  = std::numeric_limits<size_t>::max();
+    bool                    final_attempt_at_plan_creation = false;
+    // number of GPUs used by the plan: multiGPU if it exceeds 1, otherwise 1
+    size_t get_num_used_gpus() const
+    {
+        return multiGPU > 1 ? multiGPU : 1;
+    }
 };
 
 #endif
diff --git a/clients/tests/accuracy_test_1D.cpp b/clients/tests/accuracy_test_1D.cpp
index 5593ec4d..b2aa382a 100644
--- a/clients/tests/accuracy_test_1D.cpp
+++ b/clients/tests/accuracy_test_1D.cpp
@@ -81,14 +81,8 @@ static std::vector<size_t> small_1D_sizes()
     static const size_t SMALL_1D_MAX = 8192;
 
     // generate a list of sizes from 2 and up, skipping any sizes that are already covered
-    std::vector<size_t> covered_sizes;
-    std::copy(pow2_range.begin(), pow2_range.end(), std::back_inserter(covered_sizes));
-    std::copy(pow3_range.begin(), pow3_range.end(), std::back_inserter(covered_sizes));
-    std::copy(pow5_range.begin(), pow5_range.end(), std::back_inserter(covered_sizes));
-    std::copy(radX_range.begin(), radX_range.end(), std::back_inserter(covered_sizes));
-    std::copy(mix_range.begin(), mix_range.end(), std::back_inserter(covered_sizes));
-    std::copy(prime_range.begin(), prime_range.end(), std::back_inserter(covered_sizes));
-    std::sort(covered_sizes.begin(), covered_sizes.end());
+    std::vector<size_t> covered_sizes = merge_and_sort_values<size_t>(
+        {pow2_range, pow3_range, pow5_range, radX_range, mix_range, prime_range});
 
     std::vector<size_t> output;
     for(size_t i = 2; i < SMALL_1D_MAX; ++i)
@@ -319,12 +313,14 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_1D,
 
 // small 1D sizes just need to make sure our factorization isn't
 // completely broken, so we just check simple C2C outplace interleaved
+const static std::vector<size_t> small_1D_lengths = small_1D_sizes();
+
 INSTANTIATE_TEST_SUITE_P(
     small_1D,
     accuracy_test,
     ::testing::ValuesIn(param_generator_base(test_prob,
                                              {fft_transform_type_complex_forward},
-                                             generate_lengths({small_1D_sizes()}),
+                                             generate_lengths({small_1D_lengths}),
                                              {fft_precision_single},
                                              {1},
                                              generate_types,
@@ -530,3 +526,24 @@ INSTANTIATE_TEST_SUITE_P(
                                                               ooffset_range_zero,
                                                               place_range)),
     accuracy_test::TestName);
+
+const static std::vector<size_t> lengths_for_disabled_autoalloc = merge_and_sort_values<size_t>(
+    {pow2_range, pow3_range, pow5_range, radX_range, mix_range, small_1D_lengths, prime_range},
+    128);
+
+INSTANTIATE_TEST_SUITE_P(
+    various_1D,
+    accuracy_test,
+    ::testing::ValuesIn(param_generator(test_prob,
+                                        generate_lengths({lengths_for_disabled_autoalloc}),
+                                        precision_range_sp_dp,
+                                        batch_range_1D,
+                                        stride_range,
+                                        stride_range,
+                                        ioffset_range_zero,
+                                        ooffset_range_zero,
+                                        place_range,
+                                        false,
+                                        false,
+                                        fft_auto_allocation_off)),
+    accuracy_test::TestName);
diff --git a/clients/tests/accuracy_test_2D.cpp b/clients/tests/accuracy_test_2D.cpp
index 5f1b8a98..6f45abe4 100644
--- a/clients/tests/accuracy_test_2D.cpp
+++ b/clients/tests/accuracy_test_2D.cpp
@@ -278,3 +278,24 @@ INSTANTIATE_TEST_SUITE_P(len1_swap_2D,
                              false,
                              false)),
                          accuracy_test::TestName);
+
+const static std::vector<size_t> lengths_for_disabled_autoalloc
+    = merge_and_sort_values<size_t>({pow2_range, pow3_range, prime_range, mix_range}, 12);
+
+INSTANTIATE_TEST_SUITE_P(
+    various_2D,
+    accuracy_test,
+    ::testing::ValuesIn(param_generator(test_prob,
+                                        generate_lengths({lengths_for_disabled_autoalloc,
+                                                          lengths_for_disabled_autoalloc}),
+                                        precision_range_sp_dp,
+                                        batch_range,
+                                        stride_range,
+                                        stride_range,
+                                        ioffset_range_zero,
+                                        ooffset_range_zero,
+                                        place_range,
+                                        false,
+                                        false,
+                                        fft_auto_allocation_off)),
+    accuracy_test::TestName);
diff --git a/clients/tests/accuracy_test_3D.cpp b/clients/tests/accuracy_test_3D.cpp
index f3386950..f3f780de 100644
--- a/clients/tests/accuracy_test_3D.cpp
+++ b/clients/tests/accuracy_test_3D.cpp
@@ -284,3 +284,25 @@ INSTANTIATE_TEST_SUITE_P(
         false,
         false)),
     accuracy_test::TestName);
+
+const static std::vector<size_t> lengths_for_disabled_autoalloc = merge_and_sort_values<size_t>(
+    {pow2_range, pow3_range, pow5_range, prime_range, sbrc_range}, 5);
+
+INSTANTIATE_TEST_SUITE_P(
+    various_3D,
+    accuracy_test,
+    ::testing::ValuesIn(param_generator(test_prob,
+                                        generate_lengths({lengths_for_disabled_autoalloc,
+                                                          lengths_for_disabled_autoalloc,
+                                                          lengths_for_disabled_autoalloc}),
+                                        precision_range_sp_dp,
+                                        batch_range,
+                                        stride_range,
+                                        stride_range,
+                                        ioffset_range_zero,
+                                        ooffset_range_zero,
+                                        place_range,
+                                        false,
+                                        false,
+                                        fft_auto_allocation_off)),
+    accuracy_test::TestName);
diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp
index e41718d4..1366ad3c 100644
--- a/clients/tests/gtest_main.cpp
+++ b/clients/tests/gtest_main.cpp
@@ -43,6 +43,9 @@
 #include "hipfft_accuracy_test.h"
 #include "hipfft_test_params.h"
 
+// initialize static class member of hipfft_params
+std::vector<gpubuf> hipfft_params::externally_managed_workareas = std::vector<gpubuf>();
+
 // Control output verbosity:
 int verbose;
 
@@ -340,6 +343,11 @@ int main(int argc, char* argv[])
     non_token->add_flag("--callback", "Inject load/store callbacks")->each([&](const std::string&) {
         manual_params.run_callbacks = true;
     });
+    non_token
+        ->add_option("--auto_allocation",
+                     manual_params.auto_allocate,
+                     "Backend library's auto-allocation behavior: \"on\", \"off\", or \"default\"")
+        ->default_val("default");
     non_token
         ->add_flag("--double", "Double precision transform (deprecated: use --precision double)")
         ->each([&](const std::string&) { manual_params.precision = fft_precision_double; });
diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp
index 661d5791..4dc9500c 100644
--- a/clients/tests/multi_device_test.cpp
+++ b/clients/tests/multi_device_test.cpp
@@ -53,7 +53,9 @@ enum SplitType
     PENCIL_3D,
 };
 
-std::vector<fft_params> param_generator_multi_gpu(const std::optional<SplitType> type)
+std::vector<fft_params> param_generator_multi_gpu(const std::optional<SplitType> type,
+                                                  fft_auto_allocation            auto_alloc_setting
+                                                  = fft_auto_allocation_default)
 {
     int localDeviceCount = 0;
     (void)hipGetDeviceCount(&localDeviceCount);
@@ -80,7 +82,9 @@ std::vector<fft_params> param_generator_multi_gpu(const std::optional<SplitType>
                                                   ioffset_range_zero,
                                                   ooffset_range_zero,
                                                   place_range,
-                                                  false);
+                                                  false,
+                                                  false,
+                                                  auto_alloc_setting);
 
     auto params_real = param_generator_real(test_prob,
                                             multi_gpu_sizes,
@@ -91,7 +95,9 @@ std::vector<fft_params> param_generator_multi_gpu(const std::optional<SplitType>
                                             ioffset_range_zero,
                                             ooffset_range_zero,
                                             {fft_placement_notinplace},
-                                            false);
+                                            false,
+                                            false,
+                                            auto_alloc_setting);
 
     std::vector<fft_params> all_params;
 
@@ -229,3 +235,9 @@ INSTANTIATE_TEST_SUITE_P(multi_gpu,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator_multi_gpu({})),
                          accuracy_test::TestName);
+
+INSTANTIATE_TEST_SUITE_P(DISABLED_multi_gpu,
+                         accuracy_test,
+                         ::testing::ValuesIn(param_generator_multi_gpu({},
+                                                                       fft_auto_allocation_off)),
+                         accuracy_test::TestName);
diff --git a/library/src/amd_detail/hipfft.cpp b/library/src/amd_detail/hipfft.cpp
index f4413600..4096550c 100644
--- a/library/src/amd_detail/hipfft.cpp
+++ b/library/src/amd_detail/hipfft.cpp
@@ -35,15 +35,6 @@
 #include "../../../shared/ptrdiff.h"
 #include "../../../shared/rocfft_hip.h"
 
-#define ROC_FFT_CHECK_ALLOC_FAILED(ret)   \
-    {                                     \
-        auto code = ret;                  \
-        if(code != rocfft_status_success) \
-        {                                 \
-            return HIPFFT_ALLOC_FAILED;   \
-        }                                 \
-    }
-
 #define ROC_FFT_CHECK_INVALID_VALUE(ret)  \
     {                                     \
         auto code = ret;                  \
@@ -410,31 +401,6 @@ catch(...)
     return handle_exception();
 }
 
-hipfftResult hipfftPlanMany64(hipfftHandle*  plan,
-                              int            rank,
-                              long long int* n,
-                              long long int* inembed,
-                              long long int  istride,
-                              long long int  idist,
-                              long long int* onembed,
-                              long long int  ostride,
-                              long long int  odist,
-                              hipfftType     type,
-                              long long int  batch)
-try
-{
-    hipfftHandle handle = nullptr;
-    HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&handle));
-    *plan = handle;
-
-    return hipfftMakePlanMany64(
-        *plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch, nullptr);
-}
-catch(...)
-{
-    return handle_exception();
-}
-
 hipfftResult hipfftMakePlan_internal(hipfftHandle               plan,
                                      size_t                     dim,
                                      size_t*                    lengths,
@@ -1262,6 +1228,7 @@ try
 
     hipfftHandle p;
     HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p));
+    p->autoAllocate = false;
     HIP_FFT_CHECK_AND_RETURN(hipfftMakePlan1d(p, nx, type, batch, workSize));
     HIP_FFT_CHECK_AND_RETURN(hipfftDestroy(p));
 
@@ -1282,6 +1249,7 @@ try
 
     hipfftHandle p;
     HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p));
+    p->autoAllocate = false;
     HIP_FFT_CHECK_AND_RETURN(hipfftMakePlan2d(p, nx, ny, type, workSize));
     HIP_FFT_CHECK_AND_RETURN(hipfftDestroy(p));
 
@@ -1303,6 +1271,7 @@ try
 
     hipfftHandle p;
     HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p));
+    p->autoAllocate = false;
     HIP_FFT_CHECK_AND_RETURN(hipfftMakePlan3d(p, nx, ny, nz, type, workSize));
     HIP_FFT_CHECK_AND_RETURN(hipfftDestroy(p));
 
@@ -1327,10 +1296,13 @@ hipfftResult hipfftGetSizeMany(hipfftHandle plan,
                                size_t*      workSize)
 try
 {
-    hipfftHandle p;
-    HIP_FFT_CHECK_AND_RETURN(
-        hipfftPlanMany(&p, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch));
-    *workSize = p->workBufferSize;
+    if(workSize == nullptr)
+        return HIPFFT_INVALID_VALUE;
+    hipfftHandle p = nullptr;
+    HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p));
+    p->autoAllocate = false;
+    HIP_FFT_CHECK_AND_RETURN(hipfftMakePlanMany(
+        p, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch, workSize));
     HIP_FFT_CHECK_AND_RETURN(hipfftDestroy(p));
 
     return HIPFFT_SUCCESS;
@@ -1354,10 +1326,13 @@ hipfftResult hipfftGetSizeMany64(hipfftHandle   plan,
                                  size_t*        workSize)
 try
 {
+    if(workSize == nullptr)
+        return HIPFFT_INVALID_VALUE;
     hipfftHandle p = nullptr;
-    HIP_FFT_CHECK_AND_RETURN(hipfftPlanMany64(
-        &p, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch));
-    *workSize = p->workBufferSize;
+    HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p));
+    p->autoAllocate = false;
+    HIP_FFT_CHECK_AND_RETURN(hipfftMakePlanMany64(
+        p, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch, workSize));
     HIP_FFT_CHECK_AND_RETURN(hipfftDestroy(p));
 
     return HIPFFT_SUCCESS;
@@ -1831,6 +1806,7 @@ try
 
     hipfftHandle p;
     HIP_FFT_CHECK_AND_RETURN(hipfftCreate(&p));
+    p->autoAllocate = false;
 
     HIP_FFT_CHECK_AND_RETURN(hipfftMakePlanMany_internal(
         p, rank, n, inembed, istride, idist, onembed, ostride, odist, iotype, batch, workSize));
diff --git a/shared/accuracy_test.h b/shared/accuracy_test.h
index d3ea45f0..3194ec58 100644
--- a/shared/accuracy_test.h
+++ b/shared/accuracy_test.h
@@ -567,7 +567,7 @@ inline void run_round_trip_inverse(Tparams&              params,
     catch(fft_params::work_buffer_alloc_failure& e)
     {
         std::stringstream ss;
-        ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")";
+        ss << "Failed to allocate work buffer (size: " << e.attempted_size << ")";
         ++n_hip_failures;
         if(skip_runtime_fails)
         {
@@ -768,7 +768,7 @@ inline void fft_vs_reference_impl(Tparams& params, bool round_trip)
     {
         ++n_hip_failures;
         std::stringstream ss;
-        ss << "Work buffer allocation failed with size: " << params.workbuffersize;
+        ss << "Work buffer allocation failed with size: " << e.attempted_size;
         if(skip_runtime_fails)
         {
             throw ROCFFT_SKIP{ss.str()};
diff --git a/shared/fft_params.h b/shared/fft_params.h
index 4ee22e70..e3359b4d 100644
--- a/shared/fft_params.h
+++ b/shared/fft_params.h
@@ -74,7 +74,7 @@ enum fft_precision
     fft_precision_double,
 };
 
-// Used for CLI11 parsing of input gen enum
+// Used for CLI11 parsing of precision enum
 static bool lexical_cast(const std::string& word, fft_precision& precision)
 {
     if(word == "half")
@@ -88,6 +88,28 @@ static bool lexical_cast(const std::string& word, fft_precision& precision)
     return true;
 }
 
+enum fft_auto_allocation
+{
+    fft_auto_allocation_on,
+    fft_auto_allocation_off,
+    fft_auto_allocation_default
+};
+
+// Used for CLI11 parsing of auto-allocation enum
+static bool lexical_cast(const std::string& word, fft_auto_allocation& auto_allocation)
+{
+    if(word == "on")
+        auto_allocation = fft_auto_allocation_on;
+    else if(word == "off")
+        auto_allocation = fft_auto_allocation_off;
+    else if(word == "default")
+        auto_allocation = fft_auto_allocation_default;
+    else
+        throw std::runtime_error(
+            "Invalid auto-allocation behavior specified (choose \"on\", \"off\", or \"default\")");
+    return true;
+}
+
 // fft_input_generator: linearly spaced sequence in [-0.5,0.5]
 // fft_input_random_generator: pseudo-random sequence in [-0.5,0.5]
 enum fft_input_generator
@@ -491,7 +513,7 @@ class fft_params
     fft_input_generator igen = fft_input_random_generator_host;
 #endif
 
-    size_t workbuffersize = 0;
+    fft_auto_allocation auto_allocate = fft_auto_allocation_default;
 
     enum fft_mp_lib
     {
@@ -1038,6 +1060,12 @@ class fft_params
             ret += std::to_string(multiGPU);
         }
 
+        if(auto_allocate != fft_auto_allocation_default)
+        {
+            ret += "_autoallocation_";
+            ret += (auto_allocate == fft_auto_allocation_on ? "on" : "off");
+        }
+
         return ret;
     }
 
@@ -1197,6 +1225,13 @@ class fft_params
             ++pos;
             multiGPU = std::stoull(vals[pos++]);
         }
+
+        auto_allocate = fft_auto_allocation_default; // default if unspecified
+        if(pos < vals.size() && vals[pos] == "autoallocation")
+        {
+            ++pos;
+            lexical_cast(vals[pos++], auto_allocate);
+        }
     }
 
     // Stream output operator (for gtest, etc).
@@ -2246,8 +2281,10 @@ class fft_params
     // Tests that hit this can't fit on the GPU and should be skipped.
     struct work_buffer_alloc_failure : public std::runtime_error
     {
-        work_buffer_alloc_failure(const std::string& s)
+        const size_t attempted_size;
+        work_buffer_alloc_failure(const std::string& s, size_t _attempted_size = 0)
             : std::runtime_error(s)
+            , attempted_size(_attempted_size)
         {
         }
     };
@@ -2272,18 +2309,19 @@ class fft_params
             throw std::runtime_error("Transform type not forward.");
         }
 
-        length    = params_forward.length;
-        istride   = params_forward.ostride;
-        ostride   = params_forward.istride;
-        nbatch    = params_forward.nbatch;
-        precision = params_forward.precision;
-        placement = params_forward.placement;
-        idist     = params_forward.odist;
-        odist     = params_forward.idist;
-        itype     = params_forward.otype;
-        otype     = params_forward.itype;
-        ioffset   = params_forward.ooffset;
-        ooffset   = params_forward.ioffset;
+        length        = params_forward.length;
+        istride       = params_forward.ostride;
+        ostride       = params_forward.istride;
+        nbatch        = params_forward.nbatch;
+        precision     = params_forward.precision;
+        placement     = params_forward.placement;
+        idist         = params_forward.odist;
+        odist         = params_forward.idist;
+        itype         = params_forward.otype;
+        otype         = params_forward.itype;
+        ioffset       = params_forward.ooffset;
+        ooffset       = params_forward.ioffset;
+        auto_allocate = params_forward.auto_allocate;
 
         run_callbacks = params_forward.run_callbacks;
 
diff --git a/shared/params_gen.h b/shared/params_gen.h
index eb6e77a6..b9644106 100644
--- a/shared/params_gen.h
+++ b/shared/params_gen.h
@@ -58,6 +58,27 @@ inline double hash_prob(const int seed, const std::string& token)
     return roll;
 }
 
+template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, bool> = true>
+std::vector<T> merge_and_sort_values(const std::vector<std::vector<T>>& set_of_vecs,
+                                     size_t max_num_elem = std::numeric_limits<size_t>::max())
+{
+    std::vector<T> merged;
+    for(const auto& vec : set_of_vecs)
+    {
+        std::copy(vec.begin(), vec.end(), std::back_inserter(merged));
+    }
+    std::sort(merged.begin(), merged.end());
+    auto last_unique = std::unique(merged.begin(), merged.end());
+    merged.erase(last_unique, merged.end());
+    std::ranlux24_base gen(random_seed);
+    while(merged.size() > max_num_elem)
+    {
+        // remove pseudo-randomly chosen elements
+        merged.erase(merged.begin() + (static_cast<size_t>(gen()) % merged.size()));
+    }
+    return merged;
+}
+
 // Given a vector of vector of lengths, generate all unique permutations.
 // Add an optional vector of ad-hoc lengths to the result.
 inline std::vector<std::vector<size_t>>
@@ -248,7 +269,8 @@ inline auto param_generator_base(const double                             base_p
                                  const std::vector<std::vector<size_t>>&  ooffset_range,
                                  const std::vector<fft_result_placement>& place_range,
                                  const bool                               planar        = true,
-                                 const bool                               run_callbacks = false)
+                                 const bool                               run_callbacks = false,
+                                 const fft_auto_allocation auto_alloc = fft_auto_allocation_default)
 {
     std::vector<fft_params> params;
 
@@ -300,6 +322,7 @@ inline auto param_generator_base(const double                             base_p
                                             param.otype          = std::get<3>(types);
                                             param.ioffset        = ioffset;
                                             param.ooffset        = ooffset;
+                                            param.auto_allocate  = auto_alloc;
 
                                             if(run_callbacks)
                                             {
@@ -369,8 +392,8 @@ inline auto param_generator(const double                             base_prob,
                             const std::vector<std::vector<size_t>>&  ooffset_range,
                             const std::vector<fft_result_placement>& place_range,
                             const bool                               planar,
-
-                            const bool run_callbacks = false)
+                            const bool                               run_callbacks = false,
+                            const fft_auto_allocation auto_alloc = fft_auto_allocation_default)
 {
     return param_generator_base(base_prob,
                                 trans_type_range,
@@ -383,9 +406,9 @@ inline auto param_generator(const double                             base_prob,
                                 ioffset_range,
                                 ooffset_range,
                                 place_range,
-
                                 planar,
-                                run_callbacks);
+                                run_callbacks,
+                                auto_alloc);
 }
 
 // Create an array of parameters to pass to gtest.  Only tests complex-type transforms
@@ -398,9 +421,10 @@ inline auto param_generator_complex(const double                             bas
                                     const std::vector<std::vector<size_t>>&  ioffset_range,
                                     const std::vector<std::vector<size_t>>&  ooffset_range,
                                     const std::vector<fft_result_placement>& place_range,
-
-                                    const bool planar,
-                                    const bool run_callbacks = false)
+                                    const bool                               planar,
+                                    const bool                               run_callbacks = false,
+                                    const fft_auto_allocation                auto_alloc
+                                    = fft_auto_allocation_default)
 {
     return param_generator_base(base_prob,
                                 trans_type_range_complex,
@@ -413,9 +437,9 @@ inline auto param_generator_complex(const double                             bas
                                 ioffset_range,
                                 ooffset_range,
                                 place_range,
-
                                 planar,
-                                run_callbacks);
+                                run_callbacks,
+                                auto_alloc);
 }
 
 // Create an array of parameters to pass to gtest.
@@ -428,9 +452,9 @@ inline auto param_generator_real(const double                             base_p
                                  const std::vector<std::vector<size_t>>&  ioffset_range,
                                  const std::vector<std::vector<size_t>>&  ooffset_range,
                                  const std::vector<fft_result_placement>& place_range,
-
-                                 const bool planar,
-                                 const bool run_callbacks = false)
+                                 const bool                               planar,
+                                 const bool                               run_callbacks = false,
+                                 const fft_auto_allocation auto_alloc = fft_auto_allocation_default)
 {
     return param_generator_base(test_prob,
                                 trans_type_range_real,
@@ -443,9 +467,9 @@ inline auto param_generator_real(const double                             base_p
                                 ioffset_range,
                                 ooffset_range,
                                 place_range,
-
                                 planar,
-                                run_callbacks);
+                                run_callbacks,
+                                auto_alloc);
 }
 
 template <class Tcontainer>
diff --git a/shared/rocfft_params.h b/shared/rocfft_params.h
index 6f85785a..c84f60a3 100644
--- a/shared/rocfft_params.h
+++ b/shared/rocfft_params.h
@@ -170,6 +170,7 @@ class rocfft_params_base : public fft_params
     rocfft_execution_info   info = nullptr;
     rocfft_plan_description desc = nullptr;
     gpubuf_t<void>          wbuffer;
+    size_t                  workbuffersize = 0;
 
     explicit rocfft_params_base() = default;
 
@@ -410,7 +411,8 @@ class rocfft_params_base : public fft_params
         {
             return ret;
         }
-        if(workbuffersize > 0)
+        // default behavior is to feed rocfft with a work area if it needs one
+        if(workbuffersize > 0 && auto_allocate != fft_auto_allocation_on)
         {
             hipError_t hip_status = hipSuccess;
             hip_status            = wbuffer.alloc(workbuffersize);
@@ -429,7 +431,7 @@ class rocfft_params_base : public fft_params
                 {
                     oss << "hipMemGetInfo also failed";
                 }
-                throw work_buffer_alloc_failure(oss.str());
+                throw work_buffer_alloc_failure(oss.str(), workbuffersize);
             }
 
             auto rocret

From 2eda57c36705f2153020904b2c0329d38dd66a59 Mon Sep 17 00:00:00 2001
From: Raphael Egan <Raphael.Egan@amd.com>
Date: Mon, 7 Jul 2025 21:46:24 +0000
Subject: [PATCH 2/5] Remove dependency on external random_seed and repair
 build for hipfft-bench

---
 clients/bench/bench.cpp | 3 +++
 clients/hipfft_params.h | 5 ++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp
index c9cfed9e..f9b79272 100644
--- a/clients/bench/bench.cpp
+++ b/clients/bench/bench.cpp
@@ -31,6 +31,9 @@
 #include "../../shared/client_except.h"
 #include "../../shared/gpubuf.h"
 
+// initialize static class member of hipfft_params
+std::vector<gpubuf> hipfft_params::externally_managed_workareas = std::vector<gpubuf>();
+
 int main(int argc, char* argv[])
 {
     // This helps with mixing output of both wide and narrow characters to the screen
diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h
index c707f1ac..248cd45c 100644
--- a/clients/hipfft_params.h
+++ b/clients/hipfft_params.h
@@ -30,7 +30,6 @@
 #include "../shared/concurrency.h"
 #include "../shared/fft_params.h"
 #include "../shared/hipfft_brick.h"
-#include "../shared/test_params.h"
 #include "hipfft/hipfft.h"
 #include "hipfft/hipfftXt.h"
 #include <random>
@@ -46,7 +45,7 @@ template <typename T,
 static void set_with_random_nonnegative_values(const std::string& token, T& val, Args&... args)
 {
     std::hash<std::string>           hasher;
-    std::ranlux24_base               gen(random_seed + hasher(token));
+    std::ranlux24_base               gen(hasher(token));
     std::uniform_int_distribution<T> dis(static_cast<T>(0), std::numeric_limits<T>::max());
     val = dis(gen);
     ((args = dis(gen)), ...);
@@ -211,7 +210,7 @@ class hipfft_params : public fft_params
         // externally-managed workarea(s) are provided after plan generation
         // Note: this member function must return the same result even if called
         // more than once by a given instance, it must be stable for any instance
-        return (random_seed + std::hash<std::string>()(token())) % 2 == 1;
+        return std::hash<std::string>()(token()) % 2 == 1;
     }
 
     hipfft_params() = default;

From f827ad67885fbf24e9853c73829212eb4333a3f4 Mon Sep 17 00:00:00 2001
From: Raphael Egan <Raphael.Egan@amd.com>
Date: Tue, 8 Jul 2025 16:53:53 +0000
Subject: [PATCH 3/5] Adding 'auto_allocation' option to hipfft-bench target and
 clarifying description thereof

---
 clients/bench/bench.cpp      | 5 +++++
 clients/tests/gtest_main.cpp | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp
index f9b79272..2bc4a3e3 100644
--- a/clients/bench/bench.cpp
+++ b/clients/bench/bench.cpp
@@ -81,6 +81,11 @@ int main(int argc, char* argv[])
                      "Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
                      "forward\n3) real inverse")
         ->default_val(fft_transform_type_complex_forward);
+    non_token
+        ->add_option("--auto_allocation",
+                     params.auto_allocate,
+                     "HipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"")
+        ->default_val("default");
     non_token
         ->add_option(
             "--precision", params.precision, "Transform precision: single (default), double, half")
diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp
index 1366ad3c..7aaab7a3 100644
--- a/clients/tests/gtest_main.cpp
+++ b/clients/tests/gtest_main.cpp
@@ -346,7 +346,7 @@ int main(int argc, char* argv[])
     non_token
         ->add_option("--auto_allocation",
                      manual_params.auto_allocate,
-                     "Backend library's auto-allocation behavior: \"on\", \"off\", or \"default\"")
+                     "HipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"")
         ->default_val("default");
     non_token
         ->add_flag("--double", "Double precision transform (deprecated: use --precision double)")

From 517f10110ec59fe381bb28662be9803dc42c4a97 Mon Sep 17 00:00:00 2001
From: Raphael Egan <Raphael.Egan@amd.com>
Date: Wed, 9 Jul 2025 22:56:34 +0000
Subject: [PATCH 4/5] Using ad-hoc exception type for unimplemented cases and
 avoiding silent failure for get_xt_api_execution_type in case of unexpected
 precision

---
 clients/hipfft_params.h                | 13 ++++---------
 clients/tests/hipfft_accuracy_test.cpp |  4 ++++
 clients/tests/multi_device_test.cpp    |  5 +++--
 shared/fft_params.h                    |  9 +++++++++
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h
index 248cd45c..7c76fe06 100644
--- a/clients/hipfft_params.h
+++ b/clients/hipfft_params.h
@@ -488,13 +488,6 @@ class hipfft_params : public fft_params
 
     void validate_fields() const override
     {
-        if(multiGPU > 1 && auto_allocate == fft_auto_allocation_off)
-        {
-            // hipfftXtSetWorkArea would be required
-            throw std::runtime_error(
-                "cannot request externally-managed work areas with multi-gpu usage");
-        }
-
         validate_brick_volume();
 
         // multi-process only works with batch-1 FFTs, as hipFFT has
@@ -1282,8 +1275,8 @@ class hipfft_params : public fft_params
 #if(0)
             ret = hipfftXtSetWorkArea(plan, workareas.data);
 #else
-            throw std::runtime_error(
-                "cannot request externally-managed work areas with multi-gpu usage");
+            throw unimplemented_exception(
+                "No implementation support for externally-managed work areas with multi-gpu usage");
 #endif
         }
         else
@@ -1637,6 +1630,8 @@ class hipfft_params : public fft_params
         case fft_precision_double:
             ret = HIP_C_64F;
             break;
+        default:
+            throw std::runtime_error("Invalid precision");
         }
         return ret;
     }
diff --git a/clients/tests/hipfft_accuracy_test.cpp b/clients/tests/hipfft_accuracy_test.cpp
index 25c809c6..c537251d 100644
--- a/clients/tests/hipfft_accuracy_test.cpp
+++ b/clients/tests/hipfft_accuracy_test.cpp
@@ -95,6 +95,10 @@ TEST_P(accuracy_test, vs_fftw)
         {
             GTEST_SKIP() << e.msg;
         }
+        catch(const fft_params::unimplemented_exception& e)
+        {
+            GTEST_SKIP() << "Unimplemented exception: " << e.what();
+        }
         catch(ROCFFT_FAIL& e)
         {
             GTEST_FAIL() << e.msg;
diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp
index 4dc9500c..3ef05507 100644
--- a/clients/tests/multi_device_test.cpp
+++ b/clients/tests/multi_device_test.cpp
@@ -189,7 +189,6 @@ std::vector<fft_params> param_generator_multi_gpu(const std::optional<SplitType>
                 // in-place transforms require identical input/output layouts
                 if(p.placement == fft_placement_inplace && input_grid != output_grid)
                     continue;
-
                 all_params.push_back(std::move(p_dist));
             }
         }
@@ -236,7 +235,9 @@ INSTANTIATE_TEST_SUITE_P(multi_gpu,
                          ::testing::ValuesIn(param_generator_multi_gpu({})),
                          accuracy_test::TestName);
 
-INSTANTIATE_TEST_SUITE_P(DISABLED_multi_gpu,
+// Note: disabled for now due to implementation issues and
+// unimplemented features in hipFFT (to fix first)
+INSTANTIATE_TEST_SUITE_P(DISABLED_various_multi_gpu,
                          accuracy_test,
                          ::testing::ValuesIn(param_generator_multi_gpu({},
                                                                        fft_auto_allocation_off)),
diff --git a/shared/fft_params.h b/shared/fft_params.h
index e3359b4d..3abe34b4 100644
--- a/shared/fft_params.h
+++ b/shared/fft_params.h
@@ -2289,6 +2289,15 @@ class fft_params
         }
     };
 
+    // Specific exception type for unimplemented feature(s).
+    struct unimplemented_exception : public std::runtime_error
+    {
+        unimplemented_exception(const std::string& s)
+            : std::runtime_error(s)
+        {
+        }
+    };
+
     virtual fft_status create_plan()
     {
         return fft_status_success;

From bc6708a5bfa39bd1b64ad198e7714ab239c976b1 Mon Sep 17 00:00:00 2001
From: Raphael Egan <Raphael.Egan@amd.com>
Date: Thu, 17 Jul 2025 15:50:38 +0000
Subject: [PATCH 5/5] 'HipFFT' -> 'hipFFT'

---
 clients/bench/bench.cpp      | 2 +-
 clients/tests/gtest_main.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp
index 2bc4a3e3..52cf8883 100644
--- a/clients/bench/bench.cpp
+++ b/clients/bench/bench.cpp
@@ -84,7 +84,7 @@ int main(int argc, char* argv[])
     non_token
         ->add_option("--auto_allocation",
                      params.auto_allocate,
-                     "HipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"")
+                     "hipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"")
         ->default_val("default");
     non_token
         ->add_option(
diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp
index 7aaab7a3..e12dbdc3 100644
--- a/clients/tests/gtest_main.cpp
+++ b/clients/tests/gtest_main.cpp
@@ -346,7 +346,7 @@ int main(int argc, char* argv[])
     non_token
         ->add_option("--auto_allocation",
                      manual_params.auto_allocate,
-                     "HipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"")
+                     "hipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"")
         ->default_val("default");
     non_token
         ->add_flag("--double", "Double precision transform (deprecated: use --precision double)")