diff --git a/projects/miopen/CHANGELOG.md b/projects/miopen/CHANGELOG.md index 111a2b28817..c1f20273ac1 100644 --- a/projects/miopen/CHANGELOG.md +++ b/projects/miopen/CHANGELOG.md @@ -2,7 +2,9 @@ # Change Log for MIOpen Full documentation for MIOpen is available [here](https://rocm.docs.amd.com/projects/MIOpen/en/latest/) -## (Unreleased) MIOpen 3.5.1 for ROCm 8.0.0 +## (Unreleased) MIOpen 3.5.1 for ROCm 7.11.0 +### Optimized +* Added `MIOPEN_SEARCH_CUTOFF` option which can reduce tuning times by skipping slow solvers and kernels ## MIOpen 3.5.1 for ROCm 7.2.0 ### Changed diff --git a/projects/miopen/docs/reference/env_variables.rst b/projects/miopen/docs/reference/env_variables.rst index a1ee2f250a8..1c8e51f1a37 100644 --- a/projects/miopen/docs/reference/env_variables.rst +++ b/projects/miopen/docs/reference/env_variables.rst @@ -101,6 +101,11 @@ and :doc:`Performance database <../conceptual/perfdb>`. | "SEARCH_DB_UPDATE" or 4: Combination of DB_UPDATE and SEARCH (unsafe with Fast/Hybrid/Trust modes) | "DB_CLEAN" or 5: Remove optimized values from User PerfDb (unsafe with Fast/Hybrid/Trust modes) + * - | ``MIOPEN_SEARCH_CUTOFF`` + | Allows speculative early termination of suboptimal searches. + - | 1: Enable + | 0 or unset: Disable + * - | ``MIOPEN_DEBUG_DISABLE_FIND_DB`` | Disables FindDb functionality. - | 1: Disable FindDb diff --git a/projects/miopen/src/conv/solver_finders.cpp b/projects/miopen/src/conv/solver_finders.cpp index 1b7a39e0e98..e1f66e2d407 100644 --- a/projects/miopen/src/conv/solver_finders.cpp +++ b/projects/miopen/src/conv/solver_finders.cpp @@ -45,6 +45,9 @@ MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_COMPILE_ONLY) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_FIND_CONV_INSUFFICIENT_WORKSPACE_ALLOW_FINDDB_UPDATE) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_SEARCH_CUTOFF, false) +MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_SEARCH_SKIP_PCT, 130) + namespace miopen { namespace conv { @@ -202,11 +205,11 @@ const std::vector>& GetConvSolverFinders() { static const auto finders = []() { auto tmp = std::vector>{}; - tmp.emplace_back(std::make_unique()); - tmp.emplace_back(std::make_unique()); tmp.emplace_back(std::make_unique()); tmp.emplace_back(std::make_unique()); + tmp.emplace_back(std::make_unique()); tmp.emplace_back(std::make_unique()); + tmp.emplace_back(std::make_unique()); return tmp; }(); @@ -221,17 +224,18 @@ std::vector EvaluateInvokers(const Handle& handle, const AlgorithmName& algorithm_name, const NetworkConfig& network_config, const AnyInvokeParams& invoke_ctx, - bool& is_result_optimal, + FindCoreResult& core_result, bool force_attach_binary) { const auto arch = env::value(MIOPEN_DEVICE_ARCH); if(!arch.empty()) return {}; - auto selected = miopen::solver::ConvSolution{miopenStatusUnknownError}; - auto best = std::numeric_limits::max(); - auto best_invoker = Invoker{}; - auto ret = std::vector{}; + bool using_search_cutoff = env::value(MIOPEN_SEARCH_CUTOFF); + auto selected = miopen::solver::ConvSolution{miopenStatusUnknownError}; + auto best = std::numeric_limits::max(); + auto best_invoker = Invoker{}; + auto ret = std::vector{}; std::vector samples; for(const auto& sol : solutions) @@ -250,13 +254,28 @@ std::vector EvaluateInvokers(const Handle& handle, // That is why we do not write sub-optimal results into persistent find-db (on disk) // unless this is explicitly enabled via environment setting. if(!env::enabled(MIOPEN_FIND_CONV_INSUFFICIENT_WORKSPACE_ALLOW_FINDDB_UPDATE)) - is_result_optimal = false; + core_result.is_optimal = false; continue; } if(!sol.invoker_factory) MIOPEN_THROW("Invoker is not provided by solver " + sol.solver_id); + float skip_time = core_result.find_search_best_time; + if(skip_time < std::numeric_limits::max()) + { + // skip Naive if another solver has been timed and solution took more than 50ns. + if(using_search_cutoff && sol.solver_id.find("Naive") != std::string::npos && + skip_time > 0.05f) + { + MIOPEN_LOG_I("Skipping Naive Solver: " << algorithm_name.ToString() << ":" + << sol.solver_id); + continue; + } + skip_time *= env::value(MIOPEN_SEARCH_SKIP_PCT) / 100.0f; + } + MIOPEN_LOG_I("Evaluating Solver: " << algorithm_name.ToString() << ":" << sol.solver_id); + std::vector programs; const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params, @@ -272,6 +291,7 @@ std::vector EvaluateInvokers(const Handle& handle, auto first_elapsed = static_cast(0); int i = 0; samples.clear(); + while(i < N_RUNS_MAX && elapsed < TIME_MS_MAX) { invoker(handle, invoke_ctx); @@ -280,6 +300,12 @@ std::vector EvaluateInvokers(const Handle& handle, if(i > 0) { samples.push_back(handle.GetKernelTime()); + if(i == 1 && using_search_cutoff && samples.front() > skip_time) + { + MIOPEN_LOG_I("Skipping (Slow) Solver: " << algorithm_name.ToString() << ":" + << sol.solver_id); + break; + } } else { @@ -305,9 +331,10 @@ std::vector EvaluateInvokers(const Handle& handle, MIOPEN_LOG_I(sol << ": " << elapsed << (elapsed < best ? " < " : " >= ") << best); if(elapsed < best) { - best = elapsed; - selected = sol; - best_invoker = invoker; + best = elapsed; + selected = sol; + best_invoker = invoker; + core_result.find_search_best_time = best; } auto solution = Solution{solver::Id{sol.solver_id}, elapsed, sol.workspace_sz}; @@ -344,7 +371,7 @@ FindCoreResult FindCore(const AnyInvokeParams& invoke_ctx, auto& handle = ctx.GetStream(); // Find - auto solutions = std::map>{}; + auto solutions = std::vector>>{}; std::transform( finders.begin(), finders.end(), std::inserter(solutions, solutions.end()), [&](auto&& f) { return std::make_pair(f->GetAlgorithmName(problem), @@ -392,13 +419,8 @@ FindCoreResult FindCore(const AnyInvokeParams& invoke_ctx, for(const auto& ss : solutions) { - auto evaluated = EvaluateInvokers(handle, - ss.second, - ss.first, - network_config, - invoke_ctx, - ret.is_optimal, - force_attach_binary); + auto evaluated = EvaluateInvokers( + handle, ss.second, ss.first, network_config, invoke_ctx, ret, force_attach_binary); ret.solutions.insert(ret.solutions.end(), std::make_move_iterator(evaluated.begin()), diff --git a/projects/miopen/src/include/miopen/any_solver.hpp b/projects/miopen/src/include/miopen/any_solver.hpp index 18cdeba3bda..5de823cc133 100644 --- a/projects/miopen/src/include/miopen/any_solver.hpp +++ b/projects/miopen/src/include/miopen/any_solver.hpp @@ -132,10 +132,10 @@ struct AnySolver std::string GenericSearch(const ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx, - std::vector* perf_sols = nullptr) const + std::vector* perf_solsp = nullptr) const { assert(ptr_value != nullptr); - return ptr_value->GenericSearch(ctx, problem, invoke_ctx, perf_sols); + return ptr_value->GenericSearch(ctx, problem, invoke_ctx, perf_solsp); } InvokerFactory GetInvokeFactory(const ExecutionContext& ctx, @@ -213,7 +213,7 @@ struct AnySolver GenericSearch(const ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx, - std::vector* perf_sols) const = 0; + std::vector* perf_solsp) const = 0; virtual InvokerFactory GetInvokeFactory(const ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem, const std::string& perf_cfg) const = 0; @@ -457,11 +457,12 @@ struct AnySolver std::string GenericSearch(const ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx, - std::vector* perf_sols, + std::vector* perf_solsp, std::true_type, std::false_type) const { - auto config = miopen::solver::GenericSearch(value, ctx, problem, invoke_ctx, perf_sols); + auto config = + miopen::solver::GenericSearch(value, ctx, problem, invoke_ctx, perf_solsp); return config.ToString(); } @@ -490,12 +491,12 @@ struct AnySolver GenericSearch(const ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx, - std::vector* perf_sols) const override + std::vector* perf_solsp) const override { return GenericSearch(ctx, problem, invoke_ctx, - perf_sols, + perf_solsp, std::integral_constant(), std::integral_constant()); } diff --git a/projects/miopen/src/include/miopen/conv/solver_finders.hpp b/projects/miopen/src/include/miopen/conv/solver_finders.hpp index 90fda1a238b..2e598afef4a 100644 --- a/projects/miopen/src/include/miopen/conv/solver_finders.hpp +++ b/projects/miopen/src/include/miopen/conv/solver_finders.hpp @@ -160,6 +160,7 @@ const std::vector>& GetConvSolverFinders(); struct FindCoreResult { std::vector solutions; + float find_search_best_time = std::numeric_limits::max(); bool is_optimal; }; @@ -168,7 +169,7 @@ std::vector EvaluateInvokers(const Handle& handle, const AlgorithmName& algorithm_name, const NetworkConfig& network_config, const AnyInvokeParams& invoke_ctx, - bool& is_result_optimal, + FindCoreResult& core_result, bool force_attach_binary); FindCoreResult FindCore(const AnyInvokeParams& invoke_ctx, diff --git a/projects/miopen/src/include/miopen/execution_context.hpp b/projects/miopen/src/include/miopen/execution_context.hpp index 81576c363e2..9885eaab6d0 100644 --- a/projects/miopen/src/include/miopen/execution_context.hpp +++ b/projects/miopen/src/include/miopen/execution_context.hpp @@ -94,9 +94,11 @@ struct MIOPEN_INTERNALS_EXPORT ExecutionContext // to optimize the getWorkspaceSize() calls for speed. This specific optimization is correct // because Solvers shall be written so that the required workspace size does not depend on the // performance config. - bool disable_perfdb_access = false; - bool use_dynamic_solutions_only = false; - bool is_for_generic_search = false; + bool disable_perfdb_access = false; + bool use_dynamic_solutions_only = false; + bool is_for_generic_search = false; + mutable float generic_search_worst_time = std::numeric_limits::max(); + mutable float generic_search_best_time = std::numeric_limits::max(); inline const Handle& GetStream() const { return *stream; } inline void SetStream(const Handle* stream_) { stream = stream_; } diff --git a/projects/miopen/src/include/miopen/find_solution.hpp b/projects/miopen/src/include/miopen/find_solution.hpp index f6429d1123f..df794a03a8a 100644 --- a/projects/miopen/src/include/miopen/find_solution.hpp +++ b/projects/miopen/src/include/miopen/find_solution.hpp @@ -142,11 +142,11 @@ auto FindSolutionImpl(rank<1>, { auto record = DbRecord(DbKinds::PerfDb, problem); if(env::enabled(MIOPEN_WARN_SEARCH)) - MIOPEN_LOG_W("Search Start: " << record.GetKey() << " : " << s.SolverDbId() - << ", enforce: " << enforce); + MIOPEN_LOG_W("Search Started: " << record.GetKey() << " : " << s.SolverDbId() + << ", enforce: " << enforce); else - MIOPEN_LOG_I("Search Start: " << record.GetKey() << " : " << s.SolverDbId() - << ", enforce: " << enforce); + MIOPEN_LOG_I("Search Started: " << record.GetKey() << " : " << s.SolverDbId() + << ", enforce: " << enforce); try { auto c = s.Search(context, problem, invoke_ctx); diff --git a/projects/miopen/src/include/miopen/generic_search.hpp b/projects/miopen/src/include/miopen/generic_search.hpp index 6638f22ca0b..182472693de 100644 --- a/projects/miopen/src/include/miopen/generic_search.hpp +++ b/projects/miopen/src/include/miopen/generic_search.hpp @@ -426,7 +426,7 @@ auto GenericSearch(const Solver s, const Context& context_, const Problem& problem, const AnyInvokeParams& invoke_ctx_, - std::vector* perf_sols = nullptr) + std::vector* perf_solsp = nullptr) -> decltype(s.GetDefaultPerformanceConfig(context_, problem)) { auto context = context_; @@ -434,6 +434,7 @@ auto GenericSearch(const Solver s, using PerformanceConfig = decltype(s.GetDefaultPerformanceConfig(context, problem)); PerformanceConfig best_config; + PerformanceConfig last_config; // Used in cases where all kernels were intentionally skipped const auto default_solution = s.GetSolution(context, problem, s.GetDefaultPerformanceConfig(context, problem)); const auto invoke_ctx = [invoke_ctx_]() { @@ -443,10 +444,7 @@ auto GenericSearch(const Solver s, }(); // list of sampled solutions - if(perf_sols) - { - perf_sols->erase(perf_sols->begin(), perf_sols->end()); - } + std::vector perf_sols; auto& profile_h = context.GetStream(); const AutoEnableProfiling enableProfiling{profile_h}; @@ -485,6 +483,19 @@ auto GenericSearch(const Solver s, float worst_time = std::numeric_limits::max(); size_t n_failed = 0; size_t n_best = 0; + // enable early search termination + bool using_search_cutoff = env::value(MIOPEN_SEARCH_CUTOFF); + // terminate search when perf is less than cutoff + float cutoff_time = context.generic_search_worst_time; + if(cutoff_time < std::numeric_limits::max()) + cutoff_time *= env::value(MIOPEN_SEARCH_CUTOFF_MUL); + // skip detailed measurement for configs slower than skip_time + float skip_time = context.generic_search_best_time; + if(skip_time < std::numeric_limits::max()) + skip_time *= env::value(MIOPEN_SEARCH_SKIP_PCT) / 100.0f; + + bool rec_results = perf_solsp || using_search_cutoff; + HeartBeat heartbeat; heartbeat.Start(); @@ -530,6 +541,8 @@ auto GenericSearch(const Solver s, auto current_config = std::get<0>(kinder); auto current_solution = std::get<1>(kinder); + last_config = current_config; + if(std::get<2>(kinder)) { threads_remaining--; @@ -583,13 +596,23 @@ auto GenericSearch(const Solver s, if(ret == 0) { + // If config is worse than the cutoff time abort the search + if(elapsed_time > cutoff_time) + { + MIOPEN_LOG_I2("Ending Search, measured time: " + << elapsed_time << " was greater than cutoff: " << cutoff_time); + for(const auto& kernelInfo : current_solution.construction_params) + profile_h.ClearProgram(kernelInfo.kernel_file, kernelInfo.comp_options); + break; + } + // Smooth the jitter of measurements: // If the 1st probe is NOT too bad (measured time <= 1.10 * worst sample of the best // config), then gather 9 more samples, and remove positive z-score outliers. Use // the mean value with outliers removed for calculating best config. constexpr int N_RUNS = 10; last_imprv++; - if(elapsed_time / worst_time < 1.10f) + if(elapsed_time < worst_time * 1.10f && elapsed_time < skip_time) { MIOPEN_LOG_I2("Finding average for: " << elapsed_time << " / " << best_time << " = " << (elapsed_time / best_time)); @@ -636,10 +659,8 @@ auto GenericSearch(const Solver s, } } } - if(perf_sols) - { - perf_sols->push_back({current_config.ToString(), elapsed_time}); - } + if(rec_results) + perf_sols.push_back({current_config.ToString(), elapsed_time}); } // Banchmarked kernels will not be used anymore. @@ -676,13 +697,32 @@ auto GenericSearch(const Solver s, MIOPEN_LOG_I("Done: " << n_runs_total << '/' << n_failed << '/' << n_runs_total << ", best #" << n_best << ' ' << best_time << ' ' << best_config); + // If no errors were encountered, but we either cutoff or skipped every kernel, don't throw. + if(!is_passed && n_failed == 0) + { + MIOPEN_LOG_I( + "Search cutoff or skipped for all kernels. Last config returned: " << last_config); + return last_config; + } + if(!is_passed) MIOPEN_THROW("Search failed"); - if(perf_sols) - std::sort(perf_sols->begin(), perf_sols->end(), [](SolutionPerf a, SolutionPerf b) { - return a.time < b.time; - }); + std::sort(perf_sols.begin(), perf_sols.end(), [](SolutionPerf a, SolutionPerf b) { + return a.time < b.time; + }); + + // if using cutoff for search update timing + if(using_search_cutoff == true && best_time < context.generic_search_best_time) + { + float new_worst = (perf_sols.end() - 1)->time; + context_.generic_search_best_time = best_time; + context_.generic_search_worst_time = new_worst; + MIOPEN_LOG_I2("Times updated, best: " << best_time << " worst: " << new_worst); + } + + if(perf_solsp) + *perf_solsp = std::move(perf_sols); // Run once with the default config and show score. const auto& invoker = profile_h.PrepareInvoker(*default_solution.invoker_factory, diff --git a/projects/miopen/src/include/miopen/generic_search_controls.hpp b/projects/miopen/src/include/miopen/generic_search_controls.hpp index 10a1b1b1e97..a515f27fd3e 100644 --- a/projects/miopen/src/include/miopen/generic_search_controls.hpp +++ b/projects/miopen/src/include/miopen/generic_search_controls.hpp @@ -52,3 +52,9 @@ MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_COMPILE_ONLY) const size_t MIOPEN_DEFAULT_VERIFY_TOLERANCE_PCT = 15; MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_VERIFY_TOLERANCE_PCT, MIOPEN_DEFAULT_VERIFY_TOLERANCE_PCT) + +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_SEARCH_CUTOFF, false) + +MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_SEARCH_CUTOFF_MUL, 10) + +MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_SEARCH_SKIP_PCT, 130) diff --git a/projects/miopen/src/ocl/convolutionocl.cpp b/projects/miopen/src/ocl/convolutionocl.cpp index 6b69c9f08cb..2fcbe3b6c8f 100644 --- a/projects/miopen/src/ocl/convolutionocl.cpp +++ b/projects/miopen/src/ocl/convolutionocl.cpp @@ -287,7 +287,8 @@ std::vector EvaluateConvSolutions(const ExecutionContext& ctx, // test timing of solver reported by system db const auto& handle = ctx.GetStream(); AutoEnableProfiling enableProfiling{handle}; - bool is_optimal = true; + FindCoreResult core_result; + core_result.is_optimal = true; // reverse solutions so that EvaluateInvokers registers the fastest solution last auto sol_itr = solutions.rbegin(); @@ -307,7 +308,7 @@ std::vector EvaluateConvSolutions(const ExecutionContext& ctx, AlgorithmName algo{ ConvolutionAlgoToDirectionalString(id.GetAlgo(), problem.GetDirection())}; std::vector eval_sol = EvaluateInvokers( - handle, conv_sols, algo, problem.MakeNetworkConfig(), invoke_ctx, is_optimal, false); + handle, conv_sols, algo, problem.MakeNetworkConfig(), invoke_ctx, core_result, false); if(!eval_sol.empty()) eval_sols.emplace_back(eval_sol.front()); @@ -376,16 +377,16 @@ std::vector VerifiedFDBSolution(const ExecutionContext& ctx, // system db result is good // add to user fdb so this check is skipped next time MIOPEN_LOG_I2("TrustVerify: Add system db entry to user db"); - auto fallback = FallbackPath(); - auto ret = FindCoreResult(); - ret.is_optimal = true; - auto copy_sols = conv.GetSolutions(ctx, problem, 4, &fallback, &invoke_ctx); + auto fallback = FallbackPath(); + auto core_result = FindCoreResult(); + core_result.is_optimal = true; + auto copy_sols = conv.GetSolutions(ctx, problem, 4, &fallback, &invoke_ctx); for(const auto& s : copy_sols) { auto solution = Solution{solver::Id{s.solution_id}, s.time, s.workspace_size}; - ret.solutions.emplace_back(std::move(solution)); + core_result.solutions.emplace_back(std::move(solution)); } - return ret; + return core_result; } else { @@ -413,6 +414,9 @@ std::vector VerifiedFDBSolution(const ExecutionContext& ctx, else MIOPEN_LOG_I("Find Ended: " << record.GetKey()); + ctx.generic_search_worst_time = ctx_copy.generic_search_worst_time; + ctx.generic_search_best_time = ctx_copy.generic_search_best_time; + return ret; } }); @@ -520,6 +524,9 @@ std::vector FindConvolution(const ExecutionContext& ctx, else MIOPEN_LOG_I("Find Ended: " << record.GetKey()); + ctx.generic_search_worst_time = ctx_copy.generic_search_worst_time; + ctx.generic_search_best_time = ctx_copy.generic_search_best_time; + return ret; }); }