From a99f8bab97b4ce4ddc0aba472a22470e620a9572 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Wed, 11 Sep 2024 08:23:36 -0700 Subject: [PATCH 01/10] Fixing bug in kokkos counter name generation --- src/apex/apex_kokkos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apex/apex_kokkos.cpp b/src/apex/apex_kokkos.cpp index 71882b5c..df6ae085 100644 --- a/src/apex/apex_kokkos.cpp +++ b/src/apex/apex_kokkos.cpp @@ -311,11 +311,11 @@ void kokkosp_allocate_data(SpaceHandle_t handle, const char* name, APEX_UNUSED(ptr); std::stringstream ss; ss << "Kokkos " << handle.name << " data, " << name; + ss << ": Bytes"; std::string tmp2{ss.str()}; memory_mtx.lock(); memory_map().insert(std::pair(ptr, tmp2)); memory_mtx.unlock(); - ss << ": Bytes"; double bytes = (double)(size); if (apex::apex_options::use_kokkos_counters()) { apex::sample_value(tmp2, bytes); From 85b694d81137f7edc96d2cb5b85ee932fa32845e Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Wed, 11 Sep 2024 08:23:57 -0700 Subject: [PATCH 02/10] Adding --apex:kokkos-counters flag --- src/scripts/apex_exec | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/scripts/apex_exec b/src/scripts/apex_exec index 1b3e35b7..40ac2ed9 100755 --- a/src/scripts/apex_exec +++ b/src/scripts/apex_exec @@ -62,6 +62,7 @@ where APEX options are zero or more of: --apex:kokkos enable Kokkos support --apex:kokkos-tuning enable Kokkos runtime autotuning support --apex:kokkos-fence enable Kokkos fences for async kernels + --apex:kokkos-counters enable Kokkos counters for allocations and data transfers --apex:raja enable RAJA support --apex:pthread enable pthread wrapper support (default: off) --apex:track-pthread track pthread lifetime (forces --apex:pthread on) @@ -106,6 +107,7 @@ openacc=no kokkos=no kokkos_tuning=no kokkos_fence=no +kokkos_counters=no raja=no otf2=no gtrace=no @@ -175,6 +177,11 @@ while (( "$#" )); do kokkos_fence=yes shift ;; + --apex:kokkos-counters) + kokkos=yes + kokkos_counters=yes + shift + ;; --apex:kokkos_fence|--apex:kokkos-fence) kokkos=yes kokkos_fence=yes @@ -749,6 +756,9 @@ fi if [ ${kokkos_tuning} = yes ] ; then export APEX_KOKKOS_TUNING=1 fi +if [ ${kokkos_counters} = yes ] ; then + export APEX_KOKKOS_COUNTERS=1 +fi if [ ${raja} = yes ] ; then export RAJA_PLUGINS=${BASEDIR}/${LIBDIR}/${APEX_LIBRARY_NAME}${SHLIBX} fi From 230e887bf939b9b54301265903cbd3295d7544a2 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Wed, 11 Sep 2024 12:05:17 -0700 Subject: [PATCH 03/10] Cleaning up usage message --- src/scripts/apex_exec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/apex_exec b/src/scripts/apex_exec index 40ac2ed9..d624fcbd 100755 --- a/src/scripts/apex_exec +++ b/src/scripts/apex_exec @@ -54,8 +54,8 @@ where APEX options are zero or more of: --apex:otf2path specify location of OTF2 archive (default: ./OTF2_archive) --apex:otf2name specify name of OTF2 file (default: APEX) - --apex:gtrace enable Google Trace Events output (deprecated) - --apex:pftrace enable Perfetto Trace output + --apex:gtrace enable Google Trace Events output + --apex:pftrace enable Perfetto Trace output (if enabled...gtrace is a better option) --apex:scatter enable scatterplot output (python required for post-processing) --apex:openacc enable OpenACC support From 3ceaea8e54f1d8b8bdebc711f15e3198ed9a6442 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Mon, 16 Sep 2024 15:07:44 -0700 Subject: [PATCH 04/10] Adding native nelder-mead search. Due to ongoing technical difficulties with the Active Harmony integration, we've implemented our own nelder mead search. It still needs some debugging but appears to be working for discretized input sets. Still needs refactoring to handle continuous values from the Kokkos tuning interface. That's the next step. --- CMakeLists.txt | 4 + src/apex/CMakeLists_hpx.cmake | 4 + src/apex/CMakeLists_standalone.cmake | 1 + src/apex/apex_kokkos_tuning.cpp | 3 + src/apex/apex_policies.cpp | 107 ++++ src/apex/apex_policies.hpp | 16 +- src/apex/nelder_mead.cpp | 95 ++++ src/apex/nelder_mead.hpp | 169 ++++++ src/apex/nelder_mead_internal.h | 559 ++++++++++++++++++++ src/unit_tests/Kokkos/CMakeLists.txt | 1 + src/unit_tests/Kokkos/mm2d_tiling.cpp | 325 ++++++++++++ src/unit_tests/Kokkos/tuning_playground.hpp | 27 +- 12 files changed, 1301 insertions(+), 10 deletions(-) create mode 100644 src/apex/nelder_mead.cpp create mode 100644 src/apex/nelder_mead.hpp create mode 100644 src/apex/nelder_mead_internal.h create mode 100644 src/unit_tests/Kokkos/mm2d_tiling.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e5f984f5..64f76ccd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -688,6 +688,10 @@ if(APEX_WITH_KOKKOS) if(APEX_BUILD_TESTS) # Just for testing SET(Kokkos_LIBRARY kokkoscore) + set(Kokkos_ENABLE_OPENMP ON CACHE BOOL "" FORCE) + set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE) + set(Kokkos_ARCH_NATIVE ON CACHE BOOL "" FORCE) + set(Kokkos_ENABLE_TUNING ON CACHE BOOL "" FORCE) add_subdirectory(kokkos) endif(APEX_BUILD_TESTS) endif() diff --git a/src/apex/CMakeLists_hpx.cmake b/src/apex/CMakeLists_hpx.cmake index 20a9ae12..20b044bb 100644 --- a/src/apex/CMakeLists_hpx.cmake +++ b/src/apex/CMakeLists_hpx.cmake @@ -322,9 +322,11 @@ set(apex_headers dependency_tree.hpp event_listener.hpp exhaustive.hpp + genetic_search.hpp gzstream.hpp handler.hpp memory_wrapper.hpp + nelder_mead.hpp policy_handler.hpp profile.hpp profiler.hpp @@ -361,9 +363,11 @@ set(apex_sources event_listener.cpp event_filter.cpp exhaustive.cpp + genetic_search.cpp gzstream.cpp handler.cpp memory_wrapper.cpp + nelder_mead.cpp nvtx_listener.cpp policy_handler.cpp profile_reducer.cpp diff --git a/src/apex/CMakeLists_standalone.cmake b/src/apex/CMakeLists_standalone.cmake index 46e5d900..20285b56 100644 --- a/src/apex/CMakeLists_standalone.cmake +++ b/src/apex/CMakeLists_standalone.cmake @@ -90,6 +90,7 @@ exhaustive.cpp genetic_search.cpp handler.cpp memory_wrapper.cpp +nelder_mead.cpp nvtx_listener.cpp ${OTF2_SOURCE} ${perfetto_sources} diff --git a/src/apex/apex_kokkos_tuning.cpp b/src/apex/apex_kokkos_tuning.cpp index 2166515c..f138f6e5 100644 --- a/src/apex/apex_kokkos_tuning.cpp +++ b/src/apex/apex_kokkos_tuning.cpp @@ -270,6 +270,9 @@ class KokkosSession { } else if (strncmp(apex::apex_options::kokkos_tuning_policy(), "genetic_search", strlen("genetic_search")) == 0) { strategy = apex_ah_tuning_strategy::GENETIC_SEARCH; + } else if (strncmp(apex::apex_options::kokkos_tuning_policy(), + "nelder_mead", strlen("nelder_mead")) == 0) { + strategy = apex_ah_tuning_strategy::NELDER_MEAD_INTERNAL; } else { strategy = apex_ah_tuning_strategy::NELDER_MEAD; } diff --git a/src/apex/apex_policies.cpp b/src/apex/apex_policies.cpp index df4388d8..284ea7f5 100644 --- a/src/apex/apex_policies.cpp +++ b/src/apex/apex_policies.cpp @@ -914,6 +914,42 @@ int apex_sa_policy(shared_ptr tuning_session, return APEX_NOERROR; } +int apex_nelder_mead_policy(shared_ptr tuning_session, + apex_context const context) { + APEX_UNUSED(context); + if (apex_final) return APEX_NOERROR; // we terminated + std::unique_lock l{shutdown_mutex}; + /* If we are doing nested search contexts, allow us to keep searching + * on outer contexts until all inner contexts have converged! */ + bool force{true}; + if (context.data != nullptr) { + // the context data is a pointer to a boolean value + force = *((bool*)(context.data)); + } + if (tuning_session->nelder_mead_session.converged() && force) { + if (!tuning_session->converged_message) { + tuning_session->converged_message = true; + cout << "APEX: Tuning has converged for session " << tuning_session->id + << "." << endl; + tuning_session->nelder_mead_session.saveBestSettings(); + tuning_session->nelder_mead_session.printBestSettings(); + } + tuning_session->nelder_mead_session.saveBestSettings(); + return APEX_NOERROR; + } + + // get a measurement of our current setting + double new_value = tuning_session->metric_of_interest(); + + /* Report the performance we've just measured. */ + tuning_session->nelder_mead_session.evaluate(new_value); + + /* Request new settings for next time */ + tuning_session->nelder_mead_session.getNewSettings(); + + return APEX_NOERROR; +} + int apex_genetic_policy(shared_ptr tuning_session, apex_context const context) { APEX_UNUSED(context); @@ -1572,6 +1608,67 @@ inline int __sa_setup(shared_ptr return APEX_NOERROR; } +inline int __nelder_mead_setup(shared_ptr + tuning_session, apex_tuning_request & request) { + APEX_UNUSED(tuning_session); + // set up the Simulated annealing! + // iterate over the parameters, and create variables. + using namespace apex::nelder_mead; + for(auto & kv : request.params) { + auto & param = kv.second; + const char * param_name = param->get_name().c_str(); + switch(param->get_type()) { + case apex_param_type::LONG: { + auto param_long = + std::static_pointer_cast(param); + Variable v(VariableType::longtype, param_long->value.get()); + long lvalue = param_long->min; + do { + v.lvalues.push_back(lvalue); + lvalue = lvalue + param_long->step; + } while (lvalue <= param_long->max); + v.set_init(); + tuning_session->nelder_mead_session.add_var(param_name, std::move(v)); + } + break; + case apex_param_type::DOUBLE: { + auto param_double = + std::static_pointer_cast(param); + Variable v(VariableType::doubletype, param_double->value.get()); + double dvalue = param_double->min; + do { + v.dvalues.push_back(dvalue); + dvalue = dvalue + param_double->step; + } while (dvalue <= param_double->max); + v.set_init(); + tuning_session->nelder_mead_session.add_var(param_name, std::move(v)); + } + break; + case apex_param_type::ENUM: { + auto param_enum = + std::static_pointer_cast(param); + Variable v(VariableType::stringtype, param_enum->value.get()); + for(const std::string & possible_value : + param_enum->possible_values) { + v.svalues.push_back(possible_value); + } + v.set_init(); + tuning_session->nelder_mead_session.add_var(param_name, std::move(v)); + } + break; + default: + cerr << + "ERROR: Attempted to register tuning parameter with unknown type." + << endl; + return APEX_ERROR; + } + } + /* request initial settings */ + tuning_session->nelder_mead_session.getNewSettings(); + + return APEX_NOERROR; +} + inline int __genetic_setup(shared_ptr tuning_session, apex_tuning_request & request) { APEX_UNUSED(tuning_session); @@ -1848,6 +1945,16 @@ inline int __common_setup_custom_tuning(shared_ptr } ); } + } else if (request.strategy == apex_ah_tuning_strategy::NELDER_MEAD_INTERNAL) { + status = __nelder_mead_setup(tuning_session, request); + if(status == APEX_NOERROR) { + apex::register_policy( + request.trigger, + [=](apex_context const & context)->int { + return apex_nelder_mead_policy(tuning_session, context); + } + ); + } } else if (request.strategy == apex_ah_tuning_strategy::GENETIC_SEARCH) { status = __genetic_setup(tuning_session, request); if(status == APEX_NOERROR) { diff --git a/src/apex/apex_policies.hpp b/src/apex/apex_policies.hpp index da5eb63a..9a9bf4ef 100644 --- a/src/apex/apex_policies.hpp +++ b/src/apex/apex_policies.hpp @@ -35,10 +35,12 @@ #include "random.hpp" // include the genetic_search class #include "genetic_search.hpp" +// include the nelder_mead class +#include "nelder_mead.hpp" enum class apex_param_type : int {NONE, LONG, DOUBLE, ENUM}; enum class apex_ah_tuning_strategy : int { - EXHAUSTIVE, RANDOM, NELDER_MEAD, + EXHAUSTIVE, RANDOM, NELDER_MEAD, NELDER_MEAD_INTERNAL, PARALLEL_RANK_ORDER, SIMULATED_ANNEALING, APEX_EXHAUSTIVE, APEX_RANDOM, GENETIC_SEARCH}; @@ -82,6 +84,8 @@ class apex_param { tuning_session, apex_tuning_request & request); friend int __genetic_setup(std::shared_ptr tuning_session, apex_tuning_request & request); + friend int __nelder_mead_setup(std::shared_ptr + tuning_session, apex_tuning_request & request); }; class apex_param_long : public apex_param { @@ -121,6 +125,8 @@ class apex_param_long : public apex_param { tuning_session, apex_tuning_request & request); friend int __genetic_setup(std::shared_ptr tuning_session, apex_tuning_request & request); + friend int __nelder_mead_setup(std::shared_ptr + tuning_session, apex_tuning_request & request); }; class apex_param_double : public apex_param { @@ -160,6 +166,8 @@ class apex_param_double : public apex_param { tuning_session, apex_tuning_request & request); friend int __genetic_setup(std::shared_ptr tuning_session, apex_tuning_request & request); + friend int __nelder_mead_setup(std::shared_ptr + tuning_session, apex_tuning_request & request); }; class apex_param_enum : public apex_param { @@ -198,6 +206,8 @@ class apex_param_enum : public apex_param { tuning_session, apex_tuning_request & request); friend int __genetic_setup(std::shared_ptr tuning_session, apex_tuning_request & request); + friend int __nelder_mead_setup(std::shared_ptr + tuning_session, apex_tuning_request & request); }; @@ -340,6 +350,8 @@ class apex_tuning_request { tuning_session, apex_tuning_request & request); friend int __genetic_setup(std::shared_ptr tuning_session, apex_tuning_request & request); + friend int __nelder_mead_setup(std::shared_ptr + tuning_session, apex_tuning_request & request); }; @@ -368,6 +380,8 @@ struct apex_tuning_session { apex::random::Random random_session; // if using genetic, this is the request. apex::genetic::GeneticSearch genetic_session; + // if using nelder mead, this is the request. + apex::nelder_mead::NelderMead nelder_mead_session; bool converged_message = false; // variables related to power throttling diff --git a/src/apex/nelder_mead.cpp b/src/apex/nelder_mead.cpp new file mode 100644 index 00000000..43475adb --- /dev/null +++ b/src/apex/nelder_mead.cpp @@ -0,0 +1,95 @@ +#include "nelder_mead.hpp" +#include +#include +#include +#include +#include + +namespace apex { + +namespace nelder_mead { + +void NelderMead::start(void) { + // create a starting point + std::vector init_point; + for (auto& v : vars) { + init_point.push_back(v.second.get_init()); + } + // create a lower limit + std::vector lower_limit; + std::vector upper_limit; + for (auto& v : vars) { + auto& limits = v.second.get_limits(); + lower_limit.push_back(limits[0]); + upper_limit.push_back(limits[1]); + } + // create a starting simplex - random values in the space, nvars+1 of them + std::vector> init_simplex; + for (size_t i = 0 ; i < (vars.size() + 1) ; i++) { + std::vector tmp; + for (auto& v : vars) { + double r = ((double) std::rand() / (RAND_MAX)); + auto& limits = v.second.get_limits(); + tmp.push_back(r * ((limits[1] + limits[0]) / 2.0)); + } + init_simplex.push_back(tmp); + } + searcher = new apex::internal::nelder_mead::Searcher(init_point, init_simplex, lower_limit, upper_limit, true); + searcher->function_tolerance(100000); + searcher->point_tolerance(0.01); +} + +/* +static std::string vector_to_string(std::vector val) { + std::stringstream ss; + ss << "["; + for (size_t i = 0; i < val.size(); i++) { + ss << val[i]; + ss << (i == val.size() - 1 ? "]" : ","); + } + return ss.str(); +} +*/ + +void NelderMead::getNewSettings() { + if (searcher == nullptr) start(); + // get the next point from the simplex search + auto point = searcher->get_next_point(); + //std::cout << "Next point: " << vector_to_string(point) << std::endl; + size_t i{0}; + for (auto& v : vars) { + // if continuous, we just get the value + if (v.second.vtype == VariableType::continuous) { + v.second.current_value = point[i]; + } else { + // otherwise, we scale the value from [0:maxlen] to our discrete index + double tmp = point[i] * v.second.maxlen; + v.second.current_index = std::min((size_t)(std::trunc(tmp)),(v.second.maxlen-1)); + } + v.second.set_current_value(); + i++; + } +} + +void NelderMead::evaluate(double new_cost) { + // report the result + searcher->report(new_cost); + if (new_cost < cost) { + if (new_cost < best_cost) { + best_cost = new_cost; + std::cout << "New best! " << new_cost << " k: " << k << std::endl; + for (auto& v : vars) { v.second.save_best(); } + for (auto& v : vars) { std::cout << ", " << v.first << ": " << v.second.toString(); } + std::cout << std::endl; + } + cost = new_cost; + } + k++; + return; +} + +} // genetic + +} // apex + + diff --git a/src/apex/nelder_mead.hpp b/src/apex/nelder_mead.hpp new file mode 100644 index 00000000..d46df951 --- /dev/null +++ b/src/apex/nelder_mead.hpp @@ -0,0 +1,169 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "apex_types.h" +#include "apex_assert.h" +#include "nelder_mead_internal.h" + +namespace apex { + +namespace nelder_mead { + +enum class VariableType { doubletype, longtype, stringtype, continuous } ; + +class Variable { +public: + std::vector dvalues; + std::vector lvalues; + std::vector svalues; + std::array limits; + VariableType vtype; + size_t current_index; + size_t best_index; + double current_value; + double best_value; + void * value; // for the client to get the values + size_t maxlen; + Variable () = delete; + Variable (VariableType vtype, void * ptr) : vtype(vtype), current_index(0), + best_index(0), current_value(0), best_value(0), value(ptr), maxlen(0) { } + void set_current_value() { + if (vtype == VariableType::continuous) { + *((double*)(value)) = current_value; + } + else if (vtype == VariableType::doubletype) { + *((double*)(value)) = dvalues[current_index]; + } + else if (vtype == VariableType::longtype) { + *((long*)(value)) = lvalues[current_index]; + } + else { + *((const char**)(value)) = svalues[current_index].c_str(); + } + } + void save_best() { + if (vtype == VariableType::continuous) { + best_value = current_value; + } else { + best_index = current_index; + } + } + void set_init() { + maxlen = (std::max(std::max(dvalues.size(), + lvalues.size()), svalues.size())); + current_index = 0; + set_current_value(); + } + std::string getBest() { + if (vtype == VariableType::continuous) { + *((double*)(value)) = best_value; + return std::to_string(best_value); + } + else if (vtype == VariableType::doubletype) { + *((double*)(value)) = dvalues[best_index]; + return std::to_string(dvalues[best_index]); + } + else if (vtype == VariableType::longtype) { + *((long*)(value)) = lvalues[best_index]; + return std::to_string(lvalues[best_index]); + } + //else if (vtype == VariableType::stringtype) { + *((const char**)(value)) = svalues[best_index].c_str(); + return svalues[best_index]; + } + std::string toString() { + if (vtype == VariableType::continuous) { + return std::to_string(current_value); + } + if (vtype == VariableType::doubletype) { + return std::to_string(dvalues[current_index]); + } + else if (vtype == VariableType::longtype) { + return std::to_string(lvalues[current_index]); + } + //else if (vtype == VariableType::stringtype) { + return svalues[current_index]; + //} + } + double get_init(void) { + if (vtype == VariableType::continuous) { + // choose some point in the middle between the limits + return (dvalues[0] + dvalues[1]) / 2.0; + } + // otherwise, choose an index somewhere in the middle + //return ((double)maxlen / 2.0); + return 0.5; + } + const std::array& get_limits(void) { + // if our variable is continuous, we have been initialized with + // two values, the min and the max + if (vtype == VariableType::continuous) { + limits[0] = dvalues[0]; + limits[1] = dvalues[1]; + // if our variable is discrete, we will use the range from 0 to 1, + // and scale that value to the number of descrete values we have to get + // an index. + } else { + limits[0] = 0.0; + limits[1] = 1.0; + } + return limits; + } +}; + +class NelderMead { +private: + double cost; + double best_cost; + size_t kmax; + size_t k; + std::map vars; + const size_t max_iterations{500}; + const size_t min_iterations{16}; + internal::nelder_mead::Searcher* searcher; +public: + void evaluate(double new_cost); + NelderMead() : + kmax(0), k(1), searcher(nullptr) { + cost = std::numeric_limits::max(); + best_cost = cost; + } + ~NelderMead() { + if (searcher != nullptr) { + delete searcher; + } + } + bool converged() { + return searcher->converged(); + } + void getNewSettings(); + void saveBestSettings() { + for (auto& v : vars) { v.second.getBest(); } + } + void printBestSettings() { + std::string d("["); + for (auto v : vars) { + std::cout << d << v.second.getBest(); + d = ","; + } + std::cout << "]" << std::endl; + } + size_t get_max_iterations(); + std::map& get_vars() { return vars; } + void add_var(std::string name, Variable var) { + vars.insert(std::make_pair(name, var)); + } + void start(void); +}; + +} // nelder_mead + +} // apex diff --git a/src/apex/nelder_mead_internal.h b/src/apex/nelder_mead_internal.h new file mode 100644 index 00000000..f619bf93 --- /dev/null +++ b/src/apex/nelder_mead_internal.h @@ -0,0 +1,559 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace apex { +namespace internal { +namespace nelder_mead { + +template class Vec { + public: + Vec() {} + Vec(unsigned int n) : n(n) { + val.resize(n, 0); + } + Vec(std::initializer_list c) { + n = c.size(); + val.resize(n); + std::copy(c.begin().c.end(), val.begin()); + } + Vec(const Vec &lhs) { + val = lhs.val; + n = lhs.n; + } + Vec(const std::vector &lhs) { + val = lhs; + n = lhs.size(); + } + T operator()(unsigned int idx) const { + if (idx >= n) { + throw std::range_error("Element access out of range"); + } + return val[idx]; + } + T &operator()(unsigned int idx) { + if (idx >= n) { + throw std::range_error("Element access out of range"); + } + return val[idx]; + } + + Vec operator=(const Vec &rhs) { + val = rhs.val; + n = rhs.n; + return *this; + } + + Vec operator=(const std::vector &rhs) { + val = rhs; + n = rhs.size(); + return *this; + } + + Vec operator+(const Vec &rhs) const { + Vec lhs(n); + for (unsigned int i = 0; i < n; i++) { + lhs.val[i] = val[i] + rhs.val[i]; + } + return lhs; + } + Vec operator-(const Vec &rhs) const { + Vec lhs(n); + for (unsigned int i = 0; i < n; i++) { + lhs.val[i] = val[i] - rhs.val[i]; + } + return lhs; + } + + Vec operator/(T rhs) const { + Vec lhs(n); + for (unsigned int i = 0; i < n; i++) { + lhs.val[i] = val[i] / rhs; + } + return lhs; + } + Vec &operator+=(const Vec &rhs) { + if (n != rhs.n) + throw std::invalid_argument( + "The two vectors must have the same length"); + for (unsigned int i = 0; i < n; i++) { + val[i] += rhs.val[i]; + } + return *this; + } + + Vec &operator-=(const Vec &rhs) { + if (n != rhs.n) + throw std::invalid_argument( + "The two vectors must have the same length"); + for (unsigned int i = 0; i < n; i++) { + val[i] -= rhs.val[i]; + } + return *this; + } + + Vec &operator*=(T rhs) { + for (unsigned int i = 0; i < n; i++) { + val[i] *= rhs; + } + return *this; + } + + Vec &operator/=(T rhs) { + for (unsigned int i = 0; i < n; i++) { + val[i] /= rhs; + } + return *this; + } + unsigned int size() const { + return n; + } + unsigned int resize(unsigned int _n) { + val.resize(_n); + n = _n; + return n; + } + T length() const { + T ans = 0; + for (unsigned int i = 0; i < n; i++) { + ans += (val[i] * val[i]); + } + return std::sqrt(ans); + } + std::vector &vec() { + return val; + } + void enforce_min(std::vector limit) { + // don't exceed the limit! + for (unsigned int i = 0; i < n; i++) { + val[i] = std::max(val[i], limit[i]); + } + } + void enforce_max(std::vector limit) { + // don't exceed the limit! + for (unsigned int i = 0; i < n; i++) { + val[i] = std::min(val[i], limit[i]); + } + } + std::string to_string(void) { + std::stringstream ss; + ss << "["; + for (unsigned int i = 0; i < n; i++) { + ss << val[i]; + ss << (i == n - 1 ? "]" : ","); + } + return ss.str(); + } + friend Vec operator*(T a, const Vec &b) { + Vec c(b.size()); + for (unsigned int i = 0; i < b.size(); i++) { + c.val[i] = a * b.val[i]; + } + return c; + } + + private: + std::vector val; + unsigned int n; +}; + +template class Searcher { + unsigned int dimension; + bool adaptive; + T tol_fun; + T tol_x; + unsigned int max_iter; + unsigned int max_fun_evals; + T alpha; + T beta; + T gamma; + T delta; + std::vector> simplex; + unsigned int current_simplex_index; + std::vector> value_cache; + unsigned int biggest_idx; + unsigned int smallest_idx; + T biggest_val; + T second_biggest_val; + T smallest_val; + unsigned int niter; + std::vector res; + unsigned int func_evals_count; + std::vector minimum_limits; + std::vector maximum_limits; + Vec x_bar; // centroid point + Vec x_r; // reflection point + Vec x_e; // expansion point + Vec x_c; // contraction point + T reflection_val; + T expansion_val; + T contraction_val; + typedef enum { + SIMPLEX, // evaluating the simplex + REFLECTION, // evaluate the reflection + EXPANSION, // evaluate the expansion + CONTRACTION // evaluate the contraction + } Stage_t; + Stage_t stage; + bool _converged; + bool outside; + + public: + bool verbose; + Searcher(const std::vector &initial_point, + const std::vector> &initial_simplex = {}, + const std::vector &_minimum_limits = {}, + const std::vector &_maximum_limits = {}, bool _adaptive = true) + : adaptive(_adaptive), tol_fun(1e-8), tol_x(1e-8), max_iter(100000), + max_fun_evals(100000), current_simplex_index(0), + minimum_limits(_minimum_limits), maximum_limits(_maximum_limits), + _converged(false), verbose(false) { + initialize(initial_point, initial_simplex); + } + void function_tolerance(T tol) { + tol_fun = tol; + } + void point_tolerance(T tol) { + tol_x = tol; + } + void initialize(const std::vector &initial_point, + const std::vector> &initial_simplex) { + dimension = initial_point.size(); + assert(dimension > 0); + // Setting parameters + if (adaptive) { + // Using the results of doi:10.1007/s10589-010-9329-3 + alpha = 1; + beta = 1 + 2 / dimension; + gamma = 0.75 - 1 / (2 * dimension); + delta = 1 - 1 / dimension; + } else { + alpha = 1; + beta = 2; + gamma = 0.5; + delta = 0.5; + } + std::cout << alpha << " " << beta << " " << gamma << " " << delta + << std::endl; + simplex.resize(dimension + 1); + if (initial_simplex.empty()) { + // Generate initial simplex + simplex[0] = initial_point; + for (unsigned int i = 1; i <= dimension; i++) { + Vec p(initial_point); + T tau = (p(i - 1) < 1e-6 and p(i - 1) > -1e-6) ? 0.00025 : 0.05; + p(i - 1) += tau; + // enforce limits! + if (minimum_limits.size() == dimension) + p.enforce_min(minimum_limits); + if (maximum_limits.size() == dimension) + p.enforce_max(maximum_limits); + simplex[i] = p; + } + } else { + for (size_t i = 0 ; i < initial_simplex.size() ; i++) { + simplex[i] = initial_simplex[i]; + } + } + value_cache.resize(dimension + 1); + for (auto &v : value_cache) { + v.first = false; + } + biggest_idx = 0; + smallest_idx = 0; + niter = 0; + func_evals_count = 0; + stage = SIMPLEX; + } + std::vector &get_res(void) { + return res; + } + const std::vector &get_next_point(void) { + if (converged()) + return res; + switch (stage) { + case SIMPLEX: { + // return the next simplex point + return simplex[current_simplex_index].vec(); + break; + } + case REFLECTION: { + // Calculate the reflection point + x_r = x_bar + alpha * (x_bar - simplex[biggest_idx]); + // enforce limits! + if (minimum_limits.size() == dimension) + x_r.enforce_min(minimum_limits); + if (maximum_limits.size() == dimension) + x_r.enforce_max(maximum_limits); + // return the reflection point + return x_r.vec(); + break; + } + case EXPANSION: { + // Calculate the Expansion point + x_e = x_bar + beta * (x_r - x_bar); + // enforce limits! + if (minimum_limits.size() == dimension) + x_e.enforce_min(minimum_limits); + if (maximum_limits.size() == dimension) + x_e.enforce_max(maximum_limits); + // return the expansion point + return x_e.vec(); + break; + } + case CONTRACTION: { + // Compute the contraction point + outside = false; + // is the reflection better than the known worst? + if (reflection_val < biggest_val) { + // Outside contraction + outside = true; + x_c = x_bar + gamma * (x_r - x_bar); + // is the reflection worse than the known worst? + } else if (reflection_val >= biggest_val) { + // Inside contraction + x_c = x_bar - gamma * (x_r - x_bar); + } + // return the contraction point + return x_c.vec(); + break; + } + default: { + assert(false); + } + } + return res; + } + void report(T val) { + switch (stage) { + case SIMPLEX: + evaluate_simplex(val); + break; + case REFLECTION: // evaluate the reflection + // assign the value to the current reflection point + reflection_val = val; + evaluate_reflection(); + break; + case EXPANSION: // evaluate the expansion + // assign the value to the current expansion point + expansion_val = val; + evaluate_expansion(); + break; + case CONTRACTION: // evaluate the contraction + // assign the value to the current contraction point + contraction_val = val; + evaluate_contraction(); + break; + default: + assert(false); + break; + } + func_evals_count++; + return; + } + bool converged(void) { + return _converged; + } + void evaluate_simplex(T in_val) { + // if we are still getting values for the simplex points, return + if (current_simplex_index < value_cache.size()) { + // assign the value to the current simplex point + value_cache[current_simplex_index].first = true; + value_cache[current_simplex_index].second = in_val; + if (verbose) { + std::cout << "simplex " << current_simplex_index << " " + << simplex[current_simplex_index].to_string() << " = " + << in_val << std::endl; + } + current_simplex_index++; + if (current_simplex_index < value_cache.size()) { + return; + } + } + // simplex populated, check for convergence and prep for reflection + check_for_convergence(); + } + + void check_for_convergence(void) { + // if we have values for all the simplex points, find the + // best, worst, and second worst + T val = value_cache[0].second; + biggest_val = val; + smallest_val = val; + second_biggest_val = val; + biggest_idx = 0; + smallest_idx = 0; + // iterate over the other points in the simplex + for (unsigned int i = 1; i < simplex.size(); i++) { + // if we have a value, get it. + val = value_cache[i].second; + // is this value the biggest? + if (val > biggest_val) { + biggest_idx = i; + biggest_val = val; + // ...or is it the smallest? + } else if (val < smallest_val) { + smallest_idx = i; + smallest_val = val; + } + } + // at this point, we should know the biggest and smallest value + // and the simplexes that generated them + + // Calculate the difference of function values and the distance between + // points in the simplex, so that we can know when to stop the + // optimization + T max_val_diff = 0; + T max_point_diff = 0; + for (unsigned int i = 0; i < simplex.size(); i++) { + val = value_cache[i].second; + // find the second biggest value, save it for later reflection + if (i != biggest_idx and val > second_biggest_val) { + second_biggest_val = val; + // is this NOT the smallest value in the current set of simplex + // points? + } else if (i != smallest_idx) { + // how far is this point from the current best, and is it the + // furthest point from the best? + if (std::abs(val - smallest_val) > max_val_diff) { + max_val_diff = std::abs(val - smallest_val); + } + // get the manhattan distance of the vector between this point + // and the smallest one we have seen so far + T diff = (simplex[i] - simplex[smallest_idx]).length(); + // how "far" is the point from the smallest point? + if (diff > max_point_diff) { + max_point_diff = diff; + } + } + } + // have we converged? either by being within tolerance of the best - + // worst or by being within the tolerance of a point distance? + if ((max_val_diff <= tol_fun or max_point_diff <= tol_x) or + (func_evals_count >= max_fun_evals) or (niter >= max_iter)) { + res = simplex[smallest_idx].vec(); + std::cout << "Converged after " << niter << " iterations." + << std::endl; + std::cout << "Total func evaluations: " << func_evals_count + << std::endl; + _converged = true; + return; + /* + } else { + std::cout << "Not converged: " << max_val_diff << " value difference." + << std::endl; + std::cout << "Not converged: " << max_point_diff << " point difference." + << std::endl; + */ + } + + // not converged? + // Calculate the centroid of our current set + x_bar.resize(dimension); + for (unsigned int i = 0; i < dimension; i++) + x_bar(i) = 0; + for (unsigned int i = 0; i < simplex.size(); i++) { + if (i != biggest_idx) + x_bar += simplex[i]; + } + x_bar /= dimension; + + // advance to REFLECTION + stage = REFLECTION; + return; + } + void evaluate_reflection(void) { + niter++; + if (verbose) { + std::cout << "reflection = " << x_r.to_string() << " " + << reflection_val << std::endl; + } + // is it better than our current best? + if (reflection_val < smallest_val) { + stage = EXPANSION; + return; + } else if (reflection_val >= second_biggest_val) { + stage = CONTRACTION; + return; + } else { + // Reflection is good enough + // replace the worst point with the reflection point. + simplex[biggest_idx] = x_r; + value_cache[biggest_idx].second = reflection_val; + // simplex populated, check for convergence and prep for reflection + check_for_convergence(); + } + // reflection fell through, so stay in reflection stage - but get a new + // point + stage = REFLECTION; + } + void evaluate_expansion(void) { + // evaluate the expansion point + if (verbose) { + std::cout << "expansion = " << x_e.to_string() << " " + << expansion_val << std::endl; + } + // is the expansion point better than the reflection point? + if (expansion_val < reflection_val) { + // replace the worst simplex point with our new expansion point + simplex[biggest_idx] = x_e; + value_cache[biggest_idx].second = expansion_val; + // simplex populated, check for convergence and prep for reflection + check_for_convergence(); + } else { + // otherwise, replace our worst simplex with our reflection point + simplex[biggest_idx] = x_r; + value_cache[biggest_idx].second = reflection_val; + // simplex populated, check for convergence and prep for reflection + check_for_convergence(); + } + if (current_simplex_index > dimension) { + stage = REFLECTION; + } else { + // stage = SIMPLEX; + stage = REFLECTION; + } + } + void evaluate_contraction(void) { + // evaluate the contraction point + if (verbose) { + std::cout << "contraction = " << x_c.to_string() << " " + << contraction_val << std::endl; + } + // is the contraction better than the reflection or the known worst? + if ((outside and contraction_val <= reflection_val) or + (not outside and contraction_val <= biggest_val)) { + // replace the known worst with the contraction + simplex[biggest_idx] = x_c; + value_cache[biggest_idx].second = contraction_val; + // simplex populated, check for convergence and prep for reflection + check_for_convergence(); + stage = REFLECTION; + } else { + // Shrinking + if (verbose) { + std::cout << "shrinking" << std::endl; + } + // we take the whole simplex, and move every point towards the + // current best candidate + for (unsigned int i = 0; i < dimension; i++) { + if (i != smallest_idx) { + simplex[i] = simplex[smallest_idx] + + delta * (simplex[i] - simplex[smallest_idx]); + value_cache[i].first = false; + } + } + // have to re-evaluate the new simplex, so reset. + current_simplex_index = 0; + stage = SIMPLEX; + } + } +}; + +}; // namespace nelder_mead +}; // namespace internal +}; // namespace apex diff --git a/src/unit_tests/Kokkos/CMakeLists.txt b/src/unit_tests/Kokkos/CMakeLists.txt index 39c86f5e..7ec3c7aa 100644 --- a/src/unit_tests/Kokkos/CMakeLists.txt +++ b/src/unit_tests/Kokkos/CMakeLists.txt @@ -10,6 +10,7 @@ link_directories (${APEX_BINARY_DIR}/src/apex) set(example_programs simple two_var + mm2d_tiling ) foreach(example_program ${example_programs}) diff --git a/src/unit_tests/Kokkos/mm2d_tiling.cpp b/src/unit_tests/Kokkos/mm2d_tiling.cpp new file mode 100644 index 00000000..42aada15 --- /dev/null +++ b/src/unit_tests/Kokkos/mm2d_tiling.cpp @@ -0,0 +1,325 @@ +#include + +#include + +#include +#include // cbrt +#include +#include +#include +#include +#include +#include +#include + +const int M=128; +const int N=128; +const int P=128; +const std::string mm2D{"mm2D"}; +enum schedulers{StaticSchedule, DynamicSchedule}; +static const std::string scheduleNames[] = {"static", "dynamic"}; + +using matrix2d = Kokkos::View; +namespace KTE = Kokkos::Tools::Experimental; +namespace KE = Kokkos::Experimental; +constexpr int lowerBound{100}; +constexpr int upperBound{999}; + +// Helper function to generate tile sizes +std::vector factorsOf(const int &size){ + std::vector factors; + for(int i=1; i makeRange(const int &size){ + std::vector range; + for(int i=2; i<=size; i+=2){ + range.push_back(i); + } + return range; +} + +// helper function for human output +void reportOptions(std::vector& candidates, + std::string name, size_t size) { + std::cout<<"Tiling options for " << name << "="< candidates = factorsOf(limit); + reportOptions(candidates, name, limit); + // create our variable object + KTE::VariableInfo out_info; + // set the variable details + out_info.type = KTE::ValueType::kokkos_value_int64; + out_info.category = KTE::StatisticalCategory::kokkos_value_categorical; + out_info.valueQuantity = KTE::CandidateValueType::kokkos_value_set; + out_info.candidates = KTE::make_candidate_set(candidates.size(),candidates.data()); + // declare the variable + out_value_id = KTE::declare_output_type(varname,out_info); + // return the id + return out_value_id; +} + +// helper function for declaring input size variables +size_t declareInputViewSize(std::string varname, int64_t size) { + size_t in_value_id; + // create a 'vector' of value(s) + std::vector candidates = {size}; + // create our variable object + KTE::VariableInfo in_info; + // set the variable details + in_info.type = KTE::ValueType::kokkos_value_int64; + in_info.category = KTE::StatisticalCategory::kokkos_value_categorical; + in_info.valueQuantity = KTE::CandidateValueType::kokkos_value_set; + in_info.candidates = KTE::make_candidate_set(candidates.size(),candidates.data()); + // declare the variable + in_value_id = KTE::declare_input_type(varname,in_info); + // return the id + return in_value_id; +} + +// helper function for declaring scheduler variable +size_t declareOutputSchedules(std::string varname) { + // create a vector of potential values + std::vector candidates_schedule = {StaticSchedule,DynamicSchedule}; + // create our variable object + KTE::VariableInfo schedule_out_info; + // set the variable details + schedule_out_info.type = KTE::ValueType::kokkos_value_int64; + schedule_out_info.category = KTE::StatisticalCategory::kokkos_value_categorical; + schedule_out_info.valueQuantity = KTE::CandidateValueType::kokkos_value_set; + schedule_out_info.candidates = KTE::make_candidate_set(candidates_schedule.size(),candidates_schedule.data()); + // declare the variable + size_t schedule_out_value_id = KTE::declare_output_type(varname,schedule_out_info); + // return the id + return schedule_out_value_id; +} + +// helper function for declaring output tread count variable +size_t declareOutputThreadCount(std::string varname, size_t limit) { + size_t out_value_id; + // create a vector of potential values + std::vector candidates = makeRange(limit); + // create our variable object + KTE::VariableInfo out_info; + // set the variable details + out_info.type = KTE::ValueType::kokkos_value_int64; + out_info.category = KTE::StatisticalCategory::kokkos_value_categorical; + out_info.valueQuantity = KTE::CandidateValueType::kokkos_value_set; + out_info.candidates = KTE::make_candidate_set(candidates.size(),candidates.data()); + // declare the variable + out_value_id = KTE::declare_output_type(varname,out_info); + // return the id + return out_value_id; +} + +int main(int argc, char *argv[]){ + // surely there is a way to get this from Kokkos? + bool tuning = false; + char * tmp{getenv("APEX_KOKKOS_TUNING")}; + if (tmp != nullptr) { + std::string tmpstr {tmp}; + if (tmpstr.compare("1") == 0) { + tuning = true; + } + } + // initialize Kokkos + Kokkos::initialize(argc, argv); + { + // print the Kokkos configuration + Kokkos::print_configuration(std::cout, false); + // seed the random number generator, sure + srand(time(0)); + + /* Declare/Init re,ar1,ar2 */ + matrix2d ar1("array1",M,N), ar2("array2",N,P), re("Result",M,P); + initArray(ar1, M, N); + initArray(ar2, N, P); + zeroArray(re, M, P); + + // Context variable setup - needed to generate a unique context hash for tuning. + // Declare the variables and store the variable IDs + size_t id[5]; + id[0] = 1; // default input for the region name ("mm2D") + id[1] = 2; // default input for the region type ("parallel_for") + id[2] = declareInputViewSize("matrix_size_M", M); + id[3] = declareInputViewSize("matrix_size_N", N); + id[4] = declareInputViewSize("matrix_size_P", P); + + // create an input vector of variables with name, loop type, and view sizes. + std::vector input_vector{ + KTE::make_variable_value(id[0], mm2D), + KTE::make_variable_value(id[1], "parallel_for"), + KTE::make_variable_value(id[2], int64_t(M)), + KTE::make_variable_value(id[3], int64_t(N)), + KTE::make_variable_value(id[4], int64_t(P)) + }; + + // Declare the variables and store the variable IDs + size_t out_value_id[5]; + + // Tuning tile size - setup + out_value_id[0] = declareOutputTileSize("M", "ti_out", M); + out_value_id[1] = declareOutputTileSize("N", "tj_out", N); + out_value_id[2] = declareOutputTileSize("P", "tk_out", P); + // Tuning tile size - end setup + + // scheduling policy - setup + out_value_id[3] = declareOutputSchedules("schedule_out"); + // scheduling policy - end setup + + // thread count - setup + int64_t max_threads = std::min(std::thread::hardware_concurrency(), + (unsigned int)(Kokkos::OpenMP::concurrency())); + out_value_id[4] = declareOutputThreadCount("thread_count", max_threads); + // thread count - end setup + + //The second argument to make_varaible_value might be a default value + std::vector answer_vector{ + KTE::make_variable_value(out_value_id[0], int64_t(1)), + KTE::make_variable_value(out_value_id[1], int64_t(1)), + KTE::make_variable_value(out_value_id[2], int64_t(1)), + KTE::make_variable_value(out_value_id[3], int64_t(StaticSchedule)), + KTE::make_variable_value(out_value_id[4], int64_t(max_threads)) + }; + + /* Declare the kernel that does the work */ + const auto kernel = KOKKOS_LAMBDA(int i, int j, int k){ + re(i,j) += ar1(i,j) * ar2(j,k); + }; + + /* Iterate max_iterations times, so that we can explore the search + * space. Not all searches will converge - we have a large space! + * It's likely that exhaustive search will fail to converge. */ + for (int i = 0 ; i < Impl::max_iterations ; i++) { + // request a context id + size_t context = KTE::get_new_context_id(); + // start the context + KTE::begin_context(context); + + // set the input values for the context + KTE::set_input_values(context, input_vector.size(), input_vector.data()); + // request new output values for the context + KTE::request_output_values(context, answer_vector.size(), answer_vector.data()); + + // get the tiling factors + int ti,tj,tk; + ti = answer_vector[0].value.int_value; + tj = answer_vector[1].value.int_value; + tk = answer_vector[2].value.int_value; + // get our schedule and thread count + int scheduleType = answer_vector[3].value.int_value; + // there's probably a better way to set the thread count? + int num_threads = answer_vector[4].value.int_value; + int leftover_threads = max_threads - answer_vector[4].value.int_value; + + // no tuning? + if (!tuning) { + // Report the tuning, if desired + /* + std::cout << "Tiling: default, "; + std::cout << "Schedule: default "; + std::cout << "Threads: default "; + std::cout << std::endl; + */ + + // default scheduling policy, default tiling + Kokkos::MDRangePolicy> default_policy({0,0,0},{M,N,P}); + Kokkos::parallel_for( + mm2D, default_policy, KOKKOS_LAMBDA(int i, int j, int k){ + re(i,j) += ar1(i,j) * ar2(j,k); + } + ); + // use static schedule? + } else if (scheduleType == StaticSchedule) { + // Report the tuning, if desired + /* + std::cout << "Tiling: [" << ti << "," << tj << "," << tk << "], "; + std::cout << "Schedule: " << scheduleNames[scheduleType] << ", "; + std::cout << "Threads: " << answer_vector[4].value.int_value; + std::cout << std::endl; + */ + + // if using max threads, no need to partition + if (num_threads == max_threads) { + // static scheduling, tuned tiling + Kokkos::MDRangePolicy, + Kokkos::Rank<3>> static_policy({0,0,0},{M,N,P},{ti,tj,tk}); + Kokkos::parallel_for(mm2D, static_policy, kernel); + } else { + // partition the space so we can tune the number of threads + auto instances = KE::partition_space(Kokkos::OpenMP(), + num_threads, leftover_threads); + // static scheduling, tuned tiling + Kokkos::MDRangePolicy, + Kokkos::Rank<3>> static_policy(instances[0],{0,0,0},{M,N,P},{ti,tj,tk}); + Kokkos::parallel_for(mm2D, static_policy, kernel); + } + } else { + /* + // Report the tuning, if desired + std::cout << "Tiling: [" << ti << "," << tj << "," << tk << "], "; + std::cout << "Schedule: " << scheduleNames[scheduleType] << ", "; + std::cout << "Threads: " << answer_vector[4].value.int_value; + std::cout << std::endl; + */ + + // if using max threads, no need to partition + if (num_threads == max_threads) { + // dynamic scheduling, tuned tiling + Kokkos::MDRangePolicy, + Kokkos::Rank<3>> dynamic_policy({0,0,0},{M,N,P},{ti,tj,tk}); + Kokkos::parallel_for( + mm2D, dynamic_policy, kernel); + } else { + // partition the space so we can tune the number of threads + auto instances = KE::partition_space(Kokkos::OpenMP(), + num_threads, leftover_threads); + // dynamic scheduling, tuned tiling + Kokkos::MDRangePolicy, + Kokkos::Rank<3>> dynamic_policy(instances[0],{0,0,0},{M,N,P},{ti,tj,tk}); + Kokkos::parallel_for( + mm2D, dynamic_policy, kernel); + } + } + // end the context + KTE::end_context(context); + } + } + Kokkos::finalize(); +} \ No newline at end of file diff --git a/src/unit_tests/Kokkos/tuning_playground.hpp b/src/unit_tests/Kokkos/tuning_playground.hpp index ad496d1d..94a724e4 100644 --- a/src/unit_tests/Kokkos/tuning_playground.hpp +++ b/src/unit_tests/Kokkos/tuning_playground.hpp @@ -3,9 +3,12 @@ #include #include +#include namespace Impl { +constexpr const int max_iterations{1000}; + struct empty {}; template typename TupleLike, @@ -40,12 +43,10 @@ auto setup_helper(const Setup &setup, int num_iters, std::true_type) { template void tuned_kernel(int argc, char* argv[], Setup setup, Tunable tunable){ - int num_iters = 100000; - bool tuned_internals; - bool found_tuning_tool; - bool print_progress; + int num_iters = 1000; Kokkos::initialize(argc, argv); { + Kokkos::print_configuration(std::cout, false); using emptiness = typename std::is_same::type; auto kernel_data = Impl::setup_helper(setup, num_iters, emptiness{}); @@ -74,7 +75,7 @@ size_t create_categorical_int_tuner(std::string name, size_t num_options){ info.type = ValueType::kokkos_value_int64; info.valueQuantity = CandidateValueType::kokkos_value_set; std::vector options; - for(int x=0;x -void fastest_of(const std::string& label, Implementations... implementations){ +void fastest_of(const std::string& label, const size_t count, Implementations... implementations){ using namespace Kokkos::Tools::Experimental; auto tuner_iter = [&]() { auto my_tuner = ids_for_kernels.find(label); @@ -111,11 +112,19 @@ void fastest_of(const std::string& label, Implementations... implementations){ auto input_id = create_fastest_implementation_id(); VariableValue picked_implementation = make_variable_value(var_id,int64_t(0)); VariableValue which_kernel = make_variable_value(var_id,label.c_str()); + which_kernel.value.int_value = -1; auto context_id = get_new_context_id(); begin_context(context_id); - set_input_values(context_id, 1, &which_kernel); - request_output_values(context_id, 1, &picked_implementation); - fastest_of_helper(picked_implementation.value.int_value, implementations...); + set_input_values(context_id, 1, &picked_implementation); + request_output_values(context_id, 1, &which_kernel); + // if we didn't get a prediction, just alternate between methods. + if (which_kernel.value.int_value < 0) { + static int flipper{0}; + fastest_of_helper(flipper, implementations...); + flipper = (flipper + 1) % count; + } else { + fastest_of_helper(which_kernel.value.int_value, implementations...); + } end_context(context_id); } From 8a8e9d3df252438bbdc9be3db804625b676ad092 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Mon, 16 Sep 2024 20:04:29 -0700 Subject: [PATCH 05/10] Adding automatic search strategy. The basic logic for automatic is: - if there is one variable, and it is a set of categoricals, use exhaustive. - if there is one variable and it is an integer range, use simulated annealing. - if there is one variable and it is a double, use nelder mead. - if there are more than one variable, and any of them are categorical sets, then use genetic search - if any of them are integer ranges, use simulated annealing - if all double ranges, use nelder mead. --- src/apex/apex_kokkos_tuning.cpp | 80 +++++++++++++++++++++++++++++++-- src/apex/apex_policies.hpp | 3 +- src/apex/apex_types.h | 2 +- 3 files changed, 79 insertions(+), 6 deletions(-) diff --git a/src/apex/apex_kokkos_tuning.cpp b/src/apex/apex_kokkos_tuning.cpp index f138f6e5..1fe90cd0 100644 --- a/src/apex/apex_kokkos_tuning.cpp +++ b/src/apex/apex_kokkos_tuning.cpp @@ -273,8 +273,11 @@ class KokkosSession { } else if (strncmp(apex::apex_options::kokkos_tuning_policy(), "nelder_mead", strlen("nelder_mead")) == 0) { strategy = apex_ah_tuning_strategy::NELDER_MEAD_INTERNAL; + } else if (strncmp(apex::apex_options::kokkos_tuning_policy(), + "automatic", strlen("automatic")) == 0) { + strategy = apex_ah_tuning_strategy::AUTOMATIC; } else { - strategy = apex_ah_tuning_strategy::NELDER_MEAD; + strategy = apex_ah_tuning_strategy::AUTOMATIC; } } public: @@ -357,6 +360,25 @@ void KokkosSession::saveOutputVar(size_t id, Variable * var) { all_vars.insert(std::make_pair(id, var)); } +std::string strategy_to_string(std::shared_ptr request) { + if (request->get_strategy() == apex_ah_tuning_strategy::APEX_RANDOM) { + return std::string("random"); + } + if (request->get_strategy() == apex_ah_tuning_strategy::APEX_EXHAUSTIVE) { + return std::string("exhaustive"); + } + if (request->get_strategy() == apex_ah_tuning_strategy::SIMULATED_ANNEALING) { + return std::string("simulated annealing"); + } + if (request->get_strategy() == apex_ah_tuning_strategy::GENETIC_SEARCH) { + return std::string("genetic search"); + } + if (request->get_strategy() == apex_ah_tuning_strategy::NELDER_MEAD_INTERNAL) { + return std::string("nelder mead"); + } + return "unknown?"; +} + void KokkosSession::writeCache(void) { if(use_history) { return; } if(!saveCache) { return; } @@ -388,6 +410,12 @@ void KokkosSession::writeCache(void) { // always write the random search out bool converged = request->has_converged() || strategy == apex_ah_tuning_strategy::APEX_RANDOM; + results << " Strategy: \"" << + strategy_to_string(request); + if (strategy == apex_ah_tuning_strategy::AUTOMATIC) { + results << " (auto)"; + } + results << "\"" << std::endl; results << " Converged: " << (converged ? "true" : "false") << std::endl; if (converged) { @@ -501,6 +529,8 @@ void KokkosSession::parseContextCache(std::ifstream& results) { std::getline(results, line); std::string name = line.substr(line.find(delimiter)+2); name.erase(std::remove(name.begin(),name.end(),'\"'),name.end()); + // strategy + std::getline(results, line); // converged? std::getline(results, line); std::string converged = line.substr(line.find(delimiter)+2); @@ -968,8 +998,52 @@ bool handle_start(const std::string & name, const size_t vars, }; request->set_metric(metric); - // Set apex_openmp_policy_tuning_strategy - request->set_strategy(session.strategy); + // Set apex tuning strategy + if (session.strategy == apex_ah_tuning_strategy::AUTOMATIC) { + // just one variable? + if (vars == 1) { + auto id = values[0].type_id; + Variable* var{session.outputs[id]}; + // and it's a small set of candidate values? + if (var->info.valueQuantity == kokkos_value_set && + var->info.candidates.set.size < 4) { + request->set_strategy(apex_ah_tuning_strategy::APEX_EXHAUSTIVE); + // if integer, use simulated annealing + } else if (var->info.type == kokkos_value_int64) { + request->set_strategy(apex_ah_tuning_strategy::SIMULATED_ANNEALING); + // if double, use nelder mead + } else if (var->info.type == kokkos_value_double) { + request->set_strategy(apex_ah_tuning_strategy::NELDER_MEAD_INTERNAL); + } + // more than one variable... + } else { + // are any of them categorical? + bool haveSet = false; + bool allDouble = true; + for (size_t i = 0 ; i < vars ; i++) { + auto id = values[i].type_id; + Variable* var{session.outputs[id]}; + if (var->info.valueQuantity == kokkos_value_set) { + haveSet = true; + allDouble = false; + } else if (var->info.type == kokkos_value_int64) { + allDouble = false; + } + } + // if have a categorical set, use genetic search + if (haveSet) { + request->set_strategy(apex_ah_tuning_strategy::GENETIC_SEARCH); + // if all double values, use nelder mead + } else if (allDouble) { + request->set_strategy(apex_ah_tuning_strategy::NELDER_MEAD_INTERNAL); + // as default, use simulated annealing + } else { + request->set_strategy(apex_ah_tuning_strategy::SIMULATED_ANNEALING); + } + } + } else { + request->set_strategy(session.strategy); + } request->set_radius(0.5); request->set_aggregation_times(3); // min, max, mean diff --git a/src/apex/apex_policies.hpp b/src/apex/apex_policies.hpp index 9a9bf4ef..d1622eb7 100644 --- a/src/apex/apex_policies.hpp +++ b/src/apex/apex_policies.hpp @@ -42,8 +42,7 @@ enum class apex_param_type : int {NONE, LONG, DOUBLE, ENUM}; enum class apex_ah_tuning_strategy : int { EXHAUSTIVE, RANDOM, NELDER_MEAD, NELDER_MEAD_INTERNAL, PARALLEL_RANK_ORDER, SIMULATED_ANNEALING, - APEX_EXHAUSTIVE, APEX_RANDOM, - GENETIC_SEARCH}; + APEX_EXHAUSTIVE, APEX_RANDOM, GENETIC_SEARCH, AUTOMATIC}; struct apex_tuning_session; class apex_tuning_request; diff --git a/src/apex/apex_types.h b/src/apex/apex_types.h index d23c463f..ab3f4e8c 100644 --- a/src/apex/apex_types.h +++ b/src/apex/apex_types.h @@ -382,7 +382,7 @@ inline unsigned int sc_nprocessors_onln(void) APEX_DEFAULT_OTF2_ARCHIVE_NAME, "OTF2 trace filename.") \ macro (APEX_EVENT_FILTER_FILE, task_event_filter_file, char*, "", "File containing names of timers to include/exclude during data collection.") \ macro (APEX_KOKKOS_TUNING_CACHE, kokkos_tuning_cache, char*, "", "Filename contining Kokkos autotuned results, tuned offline.") \ - macro (APEX_KOKKOS_TUNING_POLICY, kokkos_tuning_policy, char*, "simulated_annealing", "Kokkos autotuning policy: random, exhaustive, simulated_annealing, nelder_mead.") \ + macro (APEX_KOKKOS_TUNING_POLICY, kokkos_tuning_policy, char*, "simulated_annealing", "Kokkos autotuning policy: random, exhaustive, simulated_annealing, nelder_mead, automatic.") \ macro (APEX_ROCPROF_METRICS, rocprof_metrics, char*, "", "List of metrics to periodically sample with the Rocprofiler library (see /opt/rocm/rocprofiler/lib/metrics.xml).") \ macro (APEX_NVTX_LIBRARY, nvtx_library, char*, "libnvToolsExt.so", "With NVTX listener, specify the location of libnvToolsExt.so.") // macro (APEX_ROCPROF_METRICS, rocprof_metrics, char*, "MemUnitBusy,MemUnitStalled,VALUUtilization,VALUBusy,SALUBusy,L2CacheHit,WriteUnitStalled,ALUStalledByLDS,LDSBankConflict", "") From f7b37e071d07eaf4aa996f7b1adf4377347205d7 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Tue, 17 Sep 2024 07:43:09 -0700 Subject: [PATCH 06/10] Don't continue to evaluate after convergence --- src/apex/nelder_mead_internal.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/apex/nelder_mead_internal.h b/src/apex/nelder_mead_internal.h index f619bf93..53f5f593 100644 --- a/src/apex/nelder_mead_internal.h +++ b/src/apex/nelder_mead_internal.h @@ -329,6 +329,8 @@ template class Searcher { return res; } void report(T val) { + if (converged()) + return; switch (stage) { case SIMPLEX: evaluate_simplex(val); From 361a2ecbe2aaec308404fa5fd69de732eab602a0 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Tue, 17 Sep 2024 12:07:50 -0700 Subject: [PATCH 07/10] DEbugging automatic search strategy, tweaking nelder mead convergence criteria. --- src/apex/exhaustive.cpp | 15 ++++++++++++--- src/apex/genetic_search.cpp | 8 +++++--- src/apex/nelder_mead.cpp | 20 +++++++++++++++----- src/apex/nelder_mead.hpp | 7 ++++++- src/apex/nelder_mead_internal.h | 6 +++--- src/apex/random.cpp | 8 +++++--- src/apex/simulated_annealing.cpp | 8 +++++--- 7 files changed, 51 insertions(+), 21 deletions(-) diff --git a/src/apex/exhaustive.cpp b/src/apex/exhaustive.cpp index c118626d..f7ab3e1e 100644 --- a/src/apex/exhaustive.cpp +++ b/src/apex/exhaustive.cpp @@ -2,6 +2,7 @@ #include #include #include +#include "apex_options.hpp" namespace apex { @@ -32,6 +33,12 @@ size_t Exhaustive::get_max_iterations() { } } } + // if we are only chosing between a couple of items, then + // increase our number of iterations so that we ensure good + // coverage. + if (vars.size() == 1 || max_iter < 10) { + max_iter = max_iter * std::max(10, apex_options::kokkos_tuning_window()+1); + } // want to see multiple values of each one //std::cout << "Max iterations: " << max_iter << std::endl; return max_iter; @@ -67,10 +74,12 @@ void Exhaustive::evaluate(double new_cost) { if (new_cost < cost) { if (new_cost < best_cost) { best_cost = new_cost; - std::cout << "New best! " << new_cost << " k: " << k + std::cout << "Exhaustive search: New best! " << new_cost << " k: " << k << " kmax: " << kmax; - for (auto& v : vars) { v.second.save_best(); } - for (auto& v : vars) { std::cout << ", " << v.first << ": " << v.second.toString(); } + for (auto& v : vars) { + std::cout << ", " << v.first << ": " << v.second.toString(); + v.second.save_best(); + } std::cout << std::endl; } cost = new_cost; diff --git a/src/apex/genetic_search.cpp b/src/apex/genetic_search.cpp index 5c581cac..7d459da0 100644 --- a/src/apex/genetic_search.cpp +++ b/src/apex/genetic_search.cpp @@ -167,10 +167,12 @@ void GeneticSearch::evaluate(double new_cost) { if (new_cost < cost) { if (new_cost < best_cost) { best_cost = new_cost; - std::cout << "New best! " << new_cost << " k: " << k + std::cout << "Genetic Search: New best! " << new_cost << " k: " << k << " kmax: " << kmax; - for (auto& v : vars) { v.second.save_best(); } - for (auto& v : vars) { std::cout << ", " << v.first << ": " << v.second.toString(); } + for (auto& v : vars) { + std::cout << ", " << v.first << ": " << v.second.toString(); + v.second.save_best(); + } std::cout << std::endl; } cost = new_cost; diff --git a/src/apex/nelder_mead.cpp b/src/apex/nelder_mead.cpp index 43475adb..602a3149 100644 --- a/src/apex/nelder_mead.cpp +++ b/src/apex/nelder_mead.cpp @@ -35,8 +35,12 @@ void NelderMead::start(void) { init_simplex.push_back(tmp); } searcher = new apex::internal::nelder_mead::Searcher(init_point, init_simplex, lower_limit, upper_limit, true); - searcher->function_tolerance(100000); - searcher->point_tolerance(0.01); + searcher->function_tolerance(10000); + if (hasDiscrete) { + searcher->point_tolerance(1.0); + } else { + searcher->point_tolerance(0.01); + } } /* @@ -77,12 +81,18 @@ void NelderMead::evaluate(double new_cost) { if (new_cost < cost) { if (new_cost < best_cost) { best_cost = new_cost; - std::cout << "New best! " << new_cost << " k: " << k << std::endl; - for (auto& v : vars) { v.second.save_best(); } - for (auto& v : vars) { std::cout << ", " << v.first << ": " << v.second.toString(); } + std::cout << "Nelder Mead: New best! " << new_cost << " k: " << k; + for (auto& v : vars) { + std::cout << ", " << v.first << ": " << v.second.toString(); + v.second.save_best(); + } std::cout << std::endl; } cost = new_cost; + // if the function evaluation takes a long time (in nanoseconds, remember), increase our tolerance. + auto tmp = std::max((new_cost / 50.0), 1000.0); + std::cout << "new function tolerance: " << tmp << std::endl; + searcher->function_tolerance(tmp); } k++; return; diff --git a/src/apex/nelder_mead.hpp b/src/apex/nelder_mead.hpp index d46df951..831c1c3f 100644 --- a/src/apex/nelder_mead.hpp +++ b/src/apex/nelder_mead.hpp @@ -129,10 +129,11 @@ class NelderMead { const size_t max_iterations{500}; const size_t min_iterations{16}; internal::nelder_mead::Searcher* searcher; + bool hasDiscrete; public: void evaluate(double new_cost); NelderMead() : - kmax(0), k(1), searcher(nullptr) { + kmax(0), k(1), searcher(nullptr), hasDiscrete(false) { cost = std::numeric_limits::max(); best_cost = cost; } @@ -160,6 +161,10 @@ class NelderMead { std::map& get_vars() { return vars; } void add_var(std::string name, Variable var) { vars.insert(std::make_pair(name, var)); + if (var.vtype == VariableType::longtype || + var.vtype == VariableType::stringtype) { + hasDiscrete = true; + } } void start(void); }; diff --git a/src/apex/nelder_mead_internal.h b/src/apex/nelder_mead_internal.h index 53f5f593..26b5ede1 100644 --- a/src/apex/nelder_mead_internal.h +++ b/src/apex/nelder_mead_internal.h @@ -208,8 +208,8 @@ template class Searcher { const std::vector> &initial_simplex = {}, const std::vector &_minimum_limits = {}, const std::vector &_maximum_limits = {}, bool _adaptive = true) - : adaptive(_adaptive), tol_fun(1e-8), tol_x(1e-8), max_iter(100000), - max_fun_evals(100000), current_simplex_index(0), + : adaptive(_adaptive), tol_fun(1e-8), tol_x(1e-8), max_iter(1000), + max_fun_evals(2000), current_simplex_index(0), minimum_limits(_minimum_limits), maximum_limits(_maximum_limits), _converged(false), verbose(false) { initialize(initial_point, initial_simplex); @@ -435,7 +435,7 @@ template class Searcher { } // have we converged? either by being within tolerance of the best - // worst or by being within the tolerance of a point distance? - if ((max_val_diff <= tol_fun or max_point_diff <= tol_x) or + if ((max_val_diff <= tol_fun and max_point_diff <= tol_x) or (func_evals_count >= max_fun_evals) or (niter >= max_iter)) { res = simplex[smallest_idx].vec(); std::cout << "Converged after " << niter << " iterations." diff --git a/src/apex/random.cpp b/src/apex/random.cpp index 6a1274d2..4eb8dd20 100644 --- a/src/apex/random.cpp +++ b/src/apex/random.cpp @@ -68,10 +68,12 @@ void Random::evaluate(double new_cost) { if (new_cost < cost) { if (new_cost < best_cost) { best_cost = new_cost; - std::cout << "New best! " << new_cost << " k: " << k + std::cout << "Random: New best! " << new_cost << " k: " << k << " kmax: " << kmax; - for (auto& v : vars) { v.second.save_best(); } - for (auto& v : vars) { std::cout << ", " << v.first << ": " << v.second.toString(); } + for (auto& v : vars) { + std::cout << ", " << v.first << ": " << v.second.toString(); + v.second.save_best(); + } std::cout << std::endl; } cost = new_cost; diff --git a/src/apex/simulated_annealing.cpp b/src/apex/simulated_annealing.cpp index 994d4085..aa7f9095 100644 --- a/src/apex/simulated_annealing.cpp +++ b/src/apex/simulated_annealing.cpp @@ -58,10 +58,12 @@ void SimulatedAnnealing::evaluate(double new_cost) { for (auto& v : vars) { v.second.choose_neighbor(); } if (new_cost < best_cost) { best_cost = new_cost; - std::cout << "New best! " << new_cost << " k: " << k + std::cout << "Simulated Annealing: New best! " << new_cost << " k: " << k << " kmax: " << kmax << " temp: " << temp; - for (auto& v : vars) { v.second.save_best(); } - for (auto& v : vars) { std::cout << ", " << v.first << ": " << v.second.toString(); } + for (auto& v : vars) { + std::cout << ", " << v.first << ": " << v.second.toString(); + v.second.save_best(); + } std::cout << std::endl; since_restart = 1; } From c9d5ea8e41cc24878354bcc9f79b86b43342eb12 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Tue, 17 Sep 2024 15:45:13 -0700 Subject: [PATCH 08/10] Cleaning up compiler warnings --- src/apex/nelder_mead.hpp | 2 +- src/apex/nelder_mead_internal.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/apex/nelder_mead.hpp b/src/apex/nelder_mead.hpp index 831c1c3f..ffe0d6fe 100644 --- a/src/apex/nelder_mead.hpp +++ b/src/apex/nelder_mead.hpp @@ -33,7 +33,7 @@ class Variable { void * value; // for the client to get the values size_t maxlen; Variable () = delete; - Variable (VariableType vtype, void * ptr) : vtype(vtype), current_index(0), + Variable (VariableType vtype, void * ptr) : limits({0.0,0.0}), vtype(vtype), current_index(0), best_index(0), current_value(0), best_value(0), value(ptr), maxlen(0) { } void set_current_value() { if (vtype == VariableType::continuous) { diff --git a/src/apex/nelder_mead_internal.h b/src/apex/nelder_mead_internal.h index 26b5ede1..a21d832d 100644 --- a/src/apex/nelder_mead_internal.h +++ b/src/apex/nelder_mead_internal.h @@ -279,7 +279,6 @@ template class Searcher { case SIMPLEX: { // return the next simplex point return simplex[current_simplex_index].vec(); - break; } case REFLECTION: { // Calculate the reflection point @@ -291,7 +290,6 @@ template class Searcher { x_r.enforce_max(maximum_limits); // return the reflection point return x_r.vec(); - break; } case EXPANSION: { // Calculate the Expansion point @@ -303,7 +301,6 @@ template class Searcher { x_e.enforce_max(maximum_limits); // return the expansion point return x_e.vec(); - break; } case CONTRACTION: { // Compute the contraction point @@ -320,7 +317,6 @@ template class Searcher { } // return the contraction point return x_c.vec(); - break; } default: { assert(false); From 5c300398a906c0f07a3093c1b1810019d7bf9bda Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Tue, 17 Sep 2024 15:54:23 -0700 Subject: [PATCH 09/10] Man, compilers are just garbage. --- src/apex/nelder_mead.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/apex/nelder_mead.hpp b/src/apex/nelder_mead.hpp index ffe0d6fe..82e2bb80 100644 --- a/src/apex/nelder_mead.hpp +++ b/src/apex/nelder_mead.hpp @@ -24,7 +24,7 @@ class Variable { std::vector dvalues; std::vector lvalues; std::vector svalues; - std::array limits; + std::vector limits; VariableType vtype; size_t current_index; size_t best_index; @@ -33,7 +33,7 @@ class Variable { void * value; // for the client to get the values size_t maxlen; Variable () = delete; - Variable (VariableType vtype, void * ptr) : limits({0.0,0.0}), vtype(vtype), current_index(0), + Variable (VariableType vtype, void * ptr) : vtype(vtype), current_index(0), best_index(0), current_value(0), best_value(0), value(ptr), maxlen(0) { } void set_current_value() { if (vtype == VariableType::continuous) { @@ -102,7 +102,8 @@ class Variable { //return ((double)maxlen / 2.0); return 0.5; } - const std::array& get_limits(void) { + const std::vector& get_limits(void) { + limits.reserve(2); // if our variable is continuous, we have been initialized with // two values, the min and the max if (vtype == VariableType::continuous) { From 8354e6d31ee2c2695ff28cffe0cf4849ab1d59bc Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Wed, 18 Sep 2024 12:22:46 -0700 Subject: [PATCH 10/10] Fixing bug in determining tree node for kokkos tuning --- src/apex/apex_kokkos_tuning.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/apex/apex_kokkos_tuning.cpp b/src/apex/apex_kokkos_tuning.cpp index 1fe90cd0..880ab9df 100644 --- a/src/apex/apex_kokkos_tuning.cpp +++ b/src/apex/apex_kokkos_tuning.cpp @@ -1211,10 +1211,12 @@ void kokkosp_request_values( Kokkos_Tools_VariableValue* tuningVariableValues) { if (!apex::apex_options::use_kokkos_tuning()) { return; } // first, get the current timer node in the task tree - auto tlt = apex::thread_instance::get_top_level_timer(); + //auto tlt = apex::thread_instance::get_top_level_timer(); + auto tlt = apex::thread_instance::get_current_profiler(); std::string tree_node{"default"}; if (tlt != nullptr) { - tree_node = tlt->tree_node->getName(); + //tree_node = tlt->tt_ptr->tree_node->getName(); + tree_node = tlt->tt_ptr->task_id->get_name(); } // don't track memory in this function. apex::in_apex prevent_memory_tracking;