diff --git a/CMakeLists.txt b/CMakeLists.txt index e5f984f5..64f76ccd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -688,6 +688,10 @@ if(APEX_WITH_KOKKOS) if(APEX_BUILD_TESTS) # Just for testing SET(Kokkos_LIBRARY kokkoscore) + set(Kokkos_ENABLE_OPENMP ON CACHE BOOL "" FORCE) + set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE) + set(Kokkos_ARCH_NATIVE ON CACHE BOOL "" FORCE) + set(Kokkos_ENABLE_TUNING ON CACHE BOOL "" FORCE) add_subdirectory(kokkos) endif(APEX_BUILD_TESTS) endif() diff --git a/src/apex/CMakeLists_hpx.cmake b/src/apex/CMakeLists_hpx.cmake index 20a9ae12..20b044bb 100644 --- a/src/apex/CMakeLists_hpx.cmake +++ b/src/apex/CMakeLists_hpx.cmake @@ -322,9 +322,11 @@ set(apex_headers dependency_tree.hpp event_listener.hpp exhaustive.hpp + genetic_search.hpp gzstream.hpp handler.hpp memory_wrapper.hpp + nelder_mead.hpp policy_handler.hpp profile.hpp profiler.hpp @@ -361,9 +363,11 @@ set(apex_sources event_listener.cpp event_filter.cpp exhaustive.cpp + genetic_search.cpp gzstream.cpp handler.cpp memory_wrapper.cpp + nelder_mead.cpp nvtx_listener.cpp policy_handler.cpp profile_reducer.cpp diff --git a/src/apex/CMakeLists_standalone.cmake b/src/apex/CMakeLists_standalone.cmake index 46e5d900..20285b56 100644 --- a/src/apex/CMakeLists_standalone.cmake +++ b/src/apex/CMakeLists_standalone.cmake @@ -90,6 +90,7 @@ exhaustive.cpp genetic_search.cpp handler.cpp memory_wrapper.cpp +nelder_mead.cpp nvtx_listener.cpp ${OTF2_SOURCE} ${perfetto_sources} diff --git a/src/apex/apex_kokkos_tuning.cpp b/src/apex/apex_kokkos_tuning.cpp index 2166515c..f138f6e5 100644 --- a/src/apex/apex_kokkos_tuning.cpp +++ b/src/apex/apex_kokkos_tuning.cpp @@ -270,6 +270,9 @@ class KokkosSession { } else if (strncmp(apex::apex_options::kokkos_tuning_policy(), "genetic_search", strlen("genetic_search")) == 0) { strategy = apex_ah_tuning_strategy::GENETIC_SEARCH; + } else if (strncmp(apex::apex_options::kokkos_tuning_policy(), + "nelder_mead", strlen("nelder_mead")) == 0) { + strategy = apex_ah_tuning_strategy::NELDER_MEAD_INTERNAL; } else { strategy = apex_ah_tuning_strategy::NELDER_MEAD; } diff --git a/src/apex/apex_policies.cpp b/src/apex/apex_policies.cpp index df4388d8..284ea7f5 100644 --- a/src/apex/apex_policies.cpp +++ b/src/apex/apex_policies.cpp @@ -914,6 +914,42 @@ int apex_sa_policy(shared_ptr tuning_session, return APEX_NOERROR; } +int apex_nelder_mead_policy(shared_ptr tuning_session, + apex_context const context) { + APEX_UNUSED(context); + if (apex_final) return APEX_NOERROR; // we terminated + std::unique_lock l{shutdown_mutex}; + /* If we are doing nested search contexts, allow us to keep searching + * on outer contexts until all inner contexts have converged! */ + bool force{true}; + if (context.data != nullptr) { + // the context data is a pointer to a boolean value + force = *((bool*)(context.data)); + } + if (tuning_session->nelder_mead_session.converged() && force) { + if (!tuning_session->converged_message) { + tuning_session->converged_message = true; + cout << "APEX: Tuning has converged for session " << tuning_session->id + << "." << endl; + tuning_session->nelder_mead_session.saveBestSettings(); + tuning_session->nelder_mead_session.printBestSettings(); + } + tuning_session->nelder_mead_session.saveBestSettings(); + return APEX_NOERROR; + } + + // get a measurement of our current setting + double new_value = tuning_session->metric_of_interest(); + + /* Report the performance we've just measured. 
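+     * The value comes from the session's metric_of_interest() and is passed
+     * straight to NelderMead::evaluate(), which reports it to the internal
+     * simplex search for the settings handed out on the previous pass.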
*/ + tuning_session->nelder_mead_session.evaluate(new_value); + + /* Request new settings for next time */ + tuning_session->nelder_mead_session.getNewSettings(); + + return APEX_NOERROR; +} + int apex_genetic_policy(shared_ptr tuning_session, apex_context const context) { APEX_UNUSED(context); @@ -1572,6 +1608,67 @@ inline int __sa_setup(shared_ptr return APEX_NOERROR; } +inline int __nelder_mead_setup(shared_ptr + tuning_session, apex_tuning_request & request) { + APEX_UNUSED(tuning_session); + // set up the Simulated annealing! + // iterate over the parameters, and create variables. + using namespace apex::nelder_mead; + for(auto & kv : request.params) { + auto & param = kv.second; + const char * param_name = param->get_name().c_str(); + switch(param->get_type()) { + case apex_param_type::LONG: { + auto param_long = + std::static_pointer_cast(param); + Variable v(VariableType::longtype, param_long->value.get()); + long lvalue = param_long->min; + do { + v.lvalues.push_back(lvalue); + lvalue = lvalue + param_long->step; + } while (lvalue <= param_long->max); + v.set_init(); + tuning_session->nelder_mead_session.add_var(param_name, std::move(v)); + } + break; + case apex_param_type::DOUBLE: { + auto param_double = + std::static_pointer_cast(param); + Variable v(VariableType::doubletype, param_double->value.get()); + double dvalue = param_double->min; + do { + v.dvalues.push_back(dvalue); + dvalue = dvalue + param_double->step; + } while (dvalue <= param_double->max); + v.set_init(); + tuning_session->nelder_mead_session.add_var(param_name, std::move(v)); + } + break; + case apex_param_type::ENUM: { + auto param_enum = + std::static_pointer_cast(param); + Variable v(VariableType::stringtype, param_enum->value.get()); + for(const std::string & possible_value : + param_enum->possible_values) { + v.svalues.push_back(possible_value); + } + v.set_init(); + tuning_session->nelder_mead_session.add_var(param_name, std::move(v)); + } + break; + default: + cerr << + "ERROR: Attempted to register tuning parameter with unknown type." 
+ << endl; + return APEX_ERROR; + } + } + /* request initial settings */ + tuning_session->nelder_mead_session.getNewSettings(); + + return APEX_NOERROR; +} + inline int __genetic_setup(shared_ptr tuning_session, apex_tuning_request & request) { APEX_UNUSED(tuning_session); @@ -1848,6 +1945,16 @@ inline int __common_setup_custom_tuning(shared_ptr } ); } + } else if (request.strategy == apex_ah_tuning_strategy::NELDER_MEAD_INTERNAL) { + status = __nelder_mead_setup(tuning_session, request); + if(status == APEX_NOERROR) { + apex::register_policy( + request.trigger, + [=](apex_context const & context)->int { + return apex_nelder_mead_policy(tuning_session, context); + } + ); + } } else if (request.strategy == apex_ah_tuning_strategy::GENETIC_SEARCH) { status = __genetic_setup(tuning_session, request); if(status == APEX_NOERROR) { diff --git a/src/apex/apex_policies.hpp b/src/apex/apex_policies.hpp index da5eb63a..9a9bf4ef 100644 --- a/src/apex/apex_policies.hpp +++ b/src/apex/apex_policies.hpp @@ -35,10 +35,12 @@ #include "random.hpp" // include the genetic_search class #include "genetic_search.hpp" +// include the nelder_mead class +#include "nelder_mead.hpp" enum class apex_param_type : int {NONE, LONG, DOUBLE, ENUM}; enum class apex_ah_tuning_strategy : int { - EXHAUSTIVE, RANDOM, NELDER_MEAD, + EXHAUSTIVE, RANDOM, NELDER_MEAD, NELDER_MEAD_INTERNAL, PARALLEL_RANK_ORDER, SIMULATED_ANNEALING, APEX_EXHAUSTIVE, APEX_RANDOM, GENETIC_SEARCH}; @@ -82,6 +84,8 @@ class apex_param { tuning_session, apex_tuning_request & request); friend int __genetic_setup(std::shared_ptr tuning_session, apex_tuning_request & request); + friend int __nelder_mead_setup(std::shared_ptr + tuning_session, apex_tuning_request & request); }; class apex_param_long : public apex_param { @@ -121,6 +125,8 @@ class apex_param_long : public apex_param { tuning_session, apex_tuning_request & request); friend int __genetic_setup(std::shared_ptr tuning_session, apex_tuning_request & request); + friend int __nelder_mead_setup(std::shared_ptr + tuning_session, apex_tuning_request & request); }; class apex_param_double : public apex_param { @@ -160,6 +166,8 @@ class apex_param_double : public apex_param { tuning_session, apex_tuning_request & request); friend int __genetic_setup(std::shared_ptr tuning_session, apex_tuning_request & request); + friend int __nelder_mead_setup(std::shared_ptr + tuning_session, apex_tuning_request & request); }; class apex_param_enum : public apex_param { @@ -198,6 +206,8 @@ class apex_param_enum : public apex_param { tuning_session, apex_tuning_request & request); friend int __genetic_setup(std::shared_ptr tuning_session, apex_tuning_request & request); + friend int __nelder_mead_setup(std::shared_ptr + tuning_session, apex_tuning_request & request); }; @@ -340,6 +350,8 @@ class apex_tuning_request { tuning_session, apex_tuning_request & request); friend int __genetic_setup(std::shared_ptr tuning_session, apex_tuning_request & request); + friend int __nelder_mead_setup(std::shared_ptr + tuning_session, apex_tuning_request & request); }; @@ -368,6 +380,8 @@ struct apex_tuning_session { apex::random::Random random_session; // if using genetic, this is the request. apex::genetic::GeneticSearch genetic_session; + // if using nelder mead, this is the request. 
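+    // (populated by __nelder_mead_setup; chosen either directly via
+    // apex_ah_tuning_strategy::NELDER_MEAD_INTERNAL or, for Kokkos, by
+    // setting the kokkos_tuning_policy option to "nelder_mead")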
+ apex::nelder_mead::NelderMead nelder_mead_session; bool converged_message = false; // variables related to power throttling diff --git a/src/apex/nelder_mead.cpp b/src/apex/nelder_mead.cpp new file mode 100644 index 00000000..43475adb --- /dev/null +++ b/src/apex/nelder_mead.cpp @@ -0,0 +1,95 @@ +#include "nelder_mead.hpp" +#include +#include +#include +#include +#include + +namespace apex { + +namespace nelder_mead { + +void NelderMead::start(void) { + // create a starting point + std::vector init_point; + for (auto& v : vars) { + init_point.push_back(v.second.get_init()); + } + // create a lower limit + std::vector lower_limit; + std::vector upper_limit; + for (auto& v : vars) { + auto& limits = v.second.get_limits(); + lower_limit.push_back(limits[0]); + upper_limit.push_back(limits[1]); + } + // create a starting simplex - random values in the space, nvars+1 of them + std::vector> init_simplex; + for (size_t i = 0 ; i < (vars.size() + 1) ; i++) { + std::vector tmp; + for (auto& v : vars) { + double r = ((double) std::rand() / (RAND_MAX)); + auto& limits = v.second.get_limits(); + tmp.push_back(r * ((limits[1] + limits[0]) / 2.0)); + } + init_simplex.push_back(tmp); + } + searcher = new apex::internal::nelder_mead::Searcher(init_point, init_simplex, lower_limit, upper_limit, true); + searcher->function_tolerance(100000); + searcher->point_tolerance(0.01); +} + +/* +static std::string vector_to_string(std::vector val) { + std::stringstream ss; + ss << "["; + for (size_t i = 0; i < val.size(); i++) { + ss << val[i]; + ss << (i == val.size() - 1 ? "]" : ","); + } + return ss.str(); +} +*/ + +void NelderMead::getNewSettings() { + if (searcher == nullptr) start(); + // get the next point from the simplex search + auto point = searcher->get_next_point(); + //std::cout << "Next point: " << vector_to_string(point) << std::endl; + size_t i{0}; + for (auto& v : vars) { + // if continuous, we just get the value + if (v.second.vtype == VariableType::continuous) { + v.second.current_value = point[i]; + } else { + // otherwise, we scale the value from [0:maxlen] to our discrete index + double tmp = point[i] * v.second.maxlen; + v.second.current_index = std::min((size_t)(std::trunc(tmp)),(v.second.maxlen-1)); + } + v.second.set_current_value(); + i++; + } +} + +void NelderMead::evaluate(double new_cost) { + // report the result + searcher->report(new_cost); + if (new_cost < cost) { + if (new_cost < best_cost) { + best_cost = new_cost; + std::cout << "New best! 
" << new_cost << " k: " << k << std::endl; + for (auto& v : vars) { v.second.save_best(); } + for (auto& v : vars) { std::cout << ", " << v.first << ": " << v.second.toString(); } + std::cout << std::endl; + } + cost = new_cost; + } + k++; + return; +} + +} // genetic + +} // apex + + diff --git a/src/apex/nelder_mead.hpp b/src/apex/nelder_mead.hpp new file mode 100644 index 00000000..d46df951 --- /dev/null +++ b/src/apex/nelder_mead.hpp @@ -0,0 +1,169 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "apex_types.h" +#include "apex_assert.h" +#include "nelder_mead_internal.h" + +namespace apex { + +namespace nelder_mead { + +enum class VariableType { doubletype, longtype, stringtype, continuous } ; + +class Variable { +public: + std::vector dvalues; + std::vector lvalues; + std::vector svalues; + std::array limits; + VariableType vtype; + size_t current_index; + size_t best_index; + double current_value; + double best_value; + void * value; // for the client to get the values + size_t maxlen; + Variable () = delete; + Variable (VariableType vtype, void * ptr) : vtype(vtype), current_index(0), + best_index(0), current_value(0), best_value(0), value(ptr), maxlen(0) { } + void set_current_value() { + if (vtype == VariableType::continuous) { + *((double*)(value)) = current_value; + } + else if (vtype == VariableType::doubletype) { + *((double*)(value)) = dvalues[current_index]; + } + else if (vtype == VariableType::longtype) { + *((long*)(value)) = lvalues[current_index]; + } + else { + *((const char**)(value)) = svalues[current_index].c_str(); + } + } + void save_best() { + if (vtype == VariableType::continuous) { + best_value = current_value; + } else { + best_index = current_index; + } + } + void set_init() { + maxlen = (std::max(std::max(dvalues.size(), + lvalues.size()), svalues.size())); + current_index = 0; + set_current_value(); + } + std::string getBest() { + if (vtype == VariableType::continuous) { + *((double*)(value)) = best_value; + return std::to_string(best_value); + } + else if (vtype == VariableType::doubletype) { + *((double*)(value)) = dvalues[best_index]; + return std::to_string(dvalues[best_index]); + } + else if (vtype == VariableType::longtype) { + *((long*)(value)) = lvalues[best_index]; + return std::to_string(lvalues[best_index]); + } + //else if (vtype == VariableType::stringtype) { + *((const char**)(value)) = svalues[best_index].c_str(); + return svalues[best_index]; + } + std::string toString() { + if (vtype == VariableType::continuous) { + return std::to_string(current_value); + } + if (vtype == VariableType::doubletype) { + return std::to_string(dvalues[current_index]); + } + else if (vtype == VariableType::longtype) { + return std::to_string(lvalues[current_index]); + } + //else if (vtype == VariableType::stringtype) { + return svalues[current_index]; + //} + } + double get_init(void) { + if (vtype == VariableType::continuous) { + // choose some point in the middle between the limits + return (dvalues[0] + dvalues[1]) / 2.0; + } + // otherwise, choose an index somewhere in the middle + //return ((double)maxlen / 2.0); + return 0.5; + } + const std::array& get_limits(void) { + // if our variable is continuous, we have been initialized with + // two values, the min and the max + if (vtype == VariableType::continuous) { + limits[0] = dvalues[0]; + limits[1] = dvalues[1]; + // if our variable is discrete, we will use the range from 0 to 1, + // and scale that value to the number of 
descrete values we have to get + // an index. + } else { + limits[0] = 0.0; + limits[1] = 1.0; + } + return limits; + } +}; + +class NelderMead { +private: + double cost; + double best_cost; + size_t kmax; + size_t k; + std::map vars; + const size_t max_iterations{500}; + const size_t min_iterations{16}; + internal::nelder_mead::Searcher* searcher; +public: + void evaluate(double new_cost); + NelderMead() : + kmax(0), k(1), searcher(nullptr) { + cost = std::numeric_limits::max(); + best_cost = cost; + } + ~NelderMead() { + if (searcher != nullptr) { + delete searcher; + } + } + bool converged() { + return searcher->converged(); + } + void getNewSettings(); + void saveBestSettings() { + for (auto& v : vars) { v.second.getBest(); } + } + void printBestSettings() { + std::string d("["); + for (auto v : vars) { + std::cout << d << v.second.getBest(); + d = ","; + } + std::cout << "]" << std::endl; + } + size_t get_max_iterations(); + std::map& get_vars() { return vars; } + void add_var(std::string name, Variable var) { + vars.insert(std::make_pair(name, var)); + } + void start(void); +}; + +} // nelder_mead + +} // apex diff --git a/src/apex/nelder_mead_internal.h b/src/apex/nelder_mead_internal.h new file mode 100644 index 00000000..f619bf93 --- /dev/null +++ b/src/apex/nelder_mead_internal.h @@ -0,0 +1,559 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace apex { +namespace internal { +namespace nelder_mead { + +template class Vec { + public: + Vec() {} + Vec(unsigned int n) : n(n) { + val.resize(n, 0); + } + Vec(std::initializer_list c) { + n = c.size(); + val.resize(n); + std::copy(c.begin().c.end(), val.begin()); + } + Vec(const Vec &lhs) { + val = lhs.val; + n = lhs.n; + } + Vec(const std::vector &lhs) { + val = lhs; + n = lhs.size(); + } + T operator()(unsigned int idx) const { + if (idx >= n) { + throw std::range_error("Element access out of range"); + } + return val[idx]; + } + T &operator()(unsigned int idx) { + if (idx >= n) { + throw std::range_error("Element access out of range"); + } + return val[idx]; + } + + Vec operator=(const Vec &rhs) { + val = rhs.val; + n = rhs.n; + return *this; + } + + Vec operator=(const std::vector &rhs) { + val = rhs; + n = rhs.size(); + return *this; + } + + Vec operator+(const Vec &rhs) const { + Vec lhs(n); + for (unsigned int i = 0; i < n; i++) { + lhs.val[i] = val[i] + rhs.val[i]; + } + return lhs; + } + Vec operator-(const Vec &rhs) const { + Vec lhs(n); + for (unsigned int i = 0; i < n; i++) { + lhs.val[i] = val[i] - rhs.val[i]; + } + return lhs; + } + + Vec operator/(T rhs) const { + Vec lhs(n); + for (unsigned int i = 0; i < n; i++) { + lhs.val[i] = val[i] / rhs; + } + return lhs; + } + Vec &operator+=(const Vec &rhs) { + if (n != rhs.n) + throw std::invalid_argument( + "The two vectors must have the same length"); + for (unsigned int i = 0; i < n; i++) { + val[i] += rhs.val[i]; + } + return *this; + } + + Vec &operator-=(const Vec &rhs) { + if (n != rhs.n) + throw std::invalid_argument( + "The two vectors must have the same length"); + for (unsigned int i = 0; i < n; i++) { + val[i] -= rhs.val[i]; + } + return *this; + } + + Vec &operator*=(T rhs) { + for (unsigned int i = 0; i < n; i++) { + val[i] *= rhs; + } + return *this; + } + + Vec &operator/=(T rhs) { + for (unsigned int i = 0; i < n; i++) { + val[i] /= rhs; + } + return *this; + } + unsigned int size() const { + return n; + } + unsigned int resize(unsigned int _n) { + val.resize(_n); + n = _n; + return n; + } + T length() const { + T ans = 0; + for 
(unsigned int i = 0; i < n; i++) { + ans += (val[i] * val[i]); + } + return std::sqrt(ans); + } + std::vector &vec() { + return val; + } + void enforce_min(std::vector limit) { + // don't exceed the limit! + for (unsigned int i = 0; i < n; i++) { + val[i] = std::max(val[i], limit[i]); + } + } + void enforce_max(std::vector limit) { + // don't exceed the limit! + for (unsigned int i = 0; i < n; i++) { + val[i] = std::min(val[i], limit[i]); + } + } + std::string to_string(void) { + std::stringstream ss; + ss << "["; + for (unsigned int i = 0; i < n; i++) { + ss << val[i]; + ss << (i == n - 1 ? "]" : ","); + } + return ss.str(); + } + friend Vec operator*(T a, const Vec &b) { + Vec c(b.size()); + for (unsigned int i = 0; i < b.size(); i++) { + c.val[i] = a * b.val[i]; + } + return c; + } + + private: + std::vector val; + unsigned int n; +}; + +template class Searcher { + unsigned int dimension; + bool adaptive; + T tol_fun; + T tol_x; + unsigned int max_iter; + unsigned int max_fun_evals; + T alpha; + T beta; + T gamma; + T delta; + std::vector> simplex; + unsigned int current_simplex_index; + std::vector> value_cache; + unsigned int biggest_idx; + unsigned int smallest_idx; + T biggest_val; + T second_biggest_val; + T smallest_val; + unsigned int niter; + std::vector res; + unsigned int func_evals_count; + std::vector minimum_limits; + std::vector maximum_limits; + Vec x_bar; // centroid point + Vec x_r; // reflection point + Vec x_e; // expansion point + Vec x_c; // contraction point + T reflection_val; + T expansion_val; + T contraction_val; + typedef enum { + SIMPLEX, // evaluating the simplex + REFLECTION, // evaluate the reflection + EXPANSION, // evaluate the expansion + CONTRACTION // evaluate the contraction + } Stage_t; + Stage_t stage; + bool _converged; + bool outside; + + public: + bool verbose; + Searcher(const std::vector &initial_point, + const std::vector> &initial_simplex = {}, + const std::vector &_minimum_limits = {}, + const std::vector &_maximum_limits = {}, bool _adaptive = true) + : adaptive(_adaptive), tol_fun(1e-8), tol_x(1e-8), max_iter(100000), + max_fun_evals(100000), current_simplex_index(0), + minimum_limits(_minimum_limits), maximum_limits(_maximum_limits), + _converged(false), verbose(false) { + initialize(initial_point, initial_simplex); + } + void function_tolerance(T tol) { + tol_fun = tol; + } + void point_tolerance(T tol) { + tol_x = tol; + } + void initialize(const std::vector &initial_point, + const std::vector> &initial_simplex) { + dimension = initial_point.size(); + assert(dimension > 0); + // Setting parameters + if (adaptive) { + // Using the results of doi:10.1007/s10589-010-9329-3 + alpha = 1; + beta = 1 + 2 / dimension; + gamma = 0.75 - 1 / (2 * dimension); + delta = 1 - 1 / dimension; + } else { + alpha = 1; + beta = 2; + gamma = 0.5; + delta = 0.5; + } + std::cout << alpha << " " << beta << " " << gamma << " " << delta + << std::endl; + simplex.resize(dimension + 1); + if (initial_simplex.empty()) { + // Generate initial simplex + simplex[0] = initial_point; + for (unsigned int i = 1; i <= dimension; i++) { + Vec p(initial_point); + T tau = (p(i - 1) < 1e-6 and p(i - 1) > -1e-6) ? 0.00025 : 0.05; + p(i - 1) += tau; + // enforce limits! 
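+                // (clamp the perturbed starting point back inside the optional
+                // [minimum_limits, maximum_limits] box before storing it)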
+ if (minimum_limits.size() == dimension) + p.enforce_min(minimum_limits); + if (maximum_limits.size() == dimension) + p.enforce_max(maximum_limits); + simplex[i] = p; + } + } else { + for (size_t i = 0 ; i < initial_simplex.size() ; i++) { + simplex[i] = initial_simplex[i]; + } + } + value_cache.resize(dimension + 1); + for (auto &v : value_cache) { + v.first = false; + } + biggest_idx = 0; + smallest_idx = 0; + niter = 0; + func_evals_count = 0; + stage = SIMPLEX; + } + std::vector &get_res(void) { + return res; + } + const std::vector &get_next_point(void) { + if (converged()) + return res; + switch (stage) { + case SIMPLEX: { + // return the next simplex point + return simplex[current_simplex_index].vec(); + break; + } + case REFLECTION: { + // Calculate the reflection point + x_r = x_bar + alpha * (x_bar - simplex[biggest_idx]); + // enforce limits! + if (minimum_limits.size() == dimension) + x_r.enforce_min(minimum_limits); + if (maximum_limits.size() == dimension) + x_r.enforce_max(maximum_limits); + // return the reflection point + return x_r.vec(); + break; + } + case EXPANSION: { + // Calculate the Expansion point + x_e = x_bar + beta * (x_r - x_bar); + // enforce limits! + if (minimum_limits.size() == dimension) + x_e.enforce_min(minimum_limits); + if (maximum_limits.size() == dimension) + x_e.enforce_max(maximum_limits); + // return the expansion point + return x_e.vec(); + break; + } + case CONTRACTION: { + // Compute the contraction point + outside = false; + // is the reflection better than the known worst? + if (reflection_val < biggest_val) { + // Outside contraction + outside = true; + x_c = x_bar + gamma * (x_r - x_bar); + // is the reflection worse than the known worst? + } else if (reflection_val >= biggest_val) { + // Inside contraction + x_c = x_bar - gamma * (x_r - x_bar); + } + // return the contraction point + return x_c.vec(); + break; + } + default: { + assert(false); + } + } + return res; + } + void report(T val) { + switch (stage) { + case SIMPLEX: + evaluate_simplex(val); + break; + case REFLECTION: // evaluate the reflection + // assign the value to the current reflection point + reflection_val = val; + evaluate_reflection(); + break; + case EXPANSION: // evaluate the expansion + // assign the value to the current expansion point + expansion_val = val; + evaluate_expansion(); + break; + case CONTRACTION: // evaluate the contraction + // assign the value to the current contraction point + contraction_val = val; + evaluate_contraction(); + break; + default: + assert(false); + break; + } + func_evals_count++; + return; + } + bool converged(void) { + return _converged; + } + void evaluate_simplex(T in_val) { + // if we are still getting values for the simplex points, return + if (current_simplex_index < value_cache.size()) { + // assign the value to the current simplex point + value_cache[current_simplex_index].first = true; + value_cache[current_simplex_index].second = in_val; + if (verbose) { + std::cout << "simplex " << current_simplex_index << " " + << simplex[current_simplex_index].to_string() << " = " + << in_val << std::endl; + } + current_simplex_index++; + if (current_simplex_index < value_cache.size()) { + return; + } + } + // simplex populated, check for convergence and prep for reflection + check_for_convergence(); + } + + void check_for_convergence(void) { + // if we have values for all the simplex points, find the + // best, worst, and second worst + T val = value_cache[0].second; + biggest_val = val; + smallest_val = val; + second_biggest_val = 
val; + biggest_idx = 0; + smallest_idx = 0; + // iterate over the other points in the simplex + for (unsigned int i = 1; i < simplex.size(); i++) { + // if we have a value, get it. + val = value_cache[i].second; + // is this value the biggest? + if (val > biggest_val) { + biggest_idx = i; + biggest_val = val; + // ...or is it the smallest? + } else if (val < smallest_val) { + smallest_idx = i; + smallest_val = val; + } + } + // at this point, we should know the biggest and smallest value + // and the simplexes that generated them + + // Calculate the difference of function values and the distance between + // points in the simplex, so that we can know when to stop the + // optimization + T max_val_diff = 0; + T max_point_diff = 0; + for (unsigned int i = 0; i < simplex.size(); i++) { + val = value_cache[i].second; + // find the second biggest value, save it for later reflection + if (i != biggest_idx and val > second_biggest_val) { + second_biggest_val = val; + // is this NOT the smallest value in the current set of simplex + // points? + } else if (i != smallest_idx) { + // how far is this point from the current best, and is it the + // furthest point from the best? + if (std::abs(val - smallest_val) > max_val_diff) { + max_val_diff = std::abs(val - smallest_val); + } + // get the manhattan distance of the vector between this point + // and the smallest one we have seen so far + T diff = (simplex[i] - simplex[smallest_idx]).length(); + // how "far" is the point from the smallest point? + if (diff > max_point_diff) { + max_point_diff = diff; + } + } + } + // have we converged? either by being within tolerance of the best - + // worst or by being within the tolerance of a point distance? + if ((max_val_diff <= tol_fun or max_point_diff <= tol_x) or + (func_evals_count >= max_fun_evals) or (niter >= max_iter)) { + res = simplex[smallest_idx].vec(); + std::cout << "Converged after " << niter << " iterations." + << std::endl; + std::cout << "Total func evaluations: " << func_evals_count + << std::endl; + _converged = true; + return; + /* + } else { + std::cout << "Not converged: " << max_val_diff << " value difference." + << std::endl; + std::cout << "Not converged: " << max_point_diff << " point difference." + << std::endl; + */ + } + + // not converged? + // Calculate the centroid of our current set + x_bar.resize(dimension); + for (unsigned int i = 0; i < dimension; i++) + x_bar(i) = 0; + for (unsigned int i = 0; i < simplex.size(); i++) { + if (i != biggest_idx) + x_bar += simplex[i]; + } + x_bar /= dimension; + + // advance to REFLECTION + stage = REFLECTION; + return; + } + void evaluate_reflection(void) { + niter++; + if (verbose) { + std::cout << "reflection = " << x_r.to_string() << " " + << reflection_val << std::endl; + } + // is it better than our current best? + if (reflection_val < smallest_val) { + stage = EXPANSION; + return; + } else if (reflection_val >= second_biggest_val) { + stage = CONTRACTION; + return; + } else { + // Reflection is good enough + // replace the worst point with the reflection point. 
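+            // (and overwrite its cached cost, so the next convergence check
+            // sees the updated simplex)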
+ simplex[biggest_idx] = x_r; + value_cache[biggest_idx].second = reflection_val; + // simplex populated, check for convergence and prep for reflection + check_for_convergence(); + } + // reflection fell through, so stay in reflection stage - but get a new + // point + stage = REFLECTION; + } + void evaluate_expansion(void) { + // evaluate the expansion point + if (verbose) { + std::cout << "expansion = " << x_e.to_string() << " " + << expansion_val << std::endl; + } + // is the expansion point better than the reflection point? + if (expansion_val < reflection_val) { + // replace the worst simplex point with our new expansion point + simplex[biggest_idx] = x_e; + value_cache[biggest_idx].second = expansion_val; + // simplex populated, check for convergence and prep for reflection + check_for_convergence(); + } else { + // otherwise, replace our worst simplex with our reflection point + simplex[biggest_idx] = x_r; + value_cache[biggest_idx].second = reflection_val; + // simplex populated, check for convergence and prep for reflection + check_for_convergence(); + } + if (current_simplex_index > dimension) { + stage = REFLECTION; + } else { + // stage = SIMPLEX; + stage = REFLECTION; + } + } + void evaluate_contraction(void) { + // evaluate the contraction point + if (verbose) { + std::cout << "contraction = " << x_c.to_string() << " " + << contraction_val << std::endl; + } + // is the contraction better than the reflection or the known worst? + if ((outside and contraction_val <= reflection_val) or + (not outside and contraction_val <= biggest_val)) { + // replace the known worst with the contraction + simplex[biggest_idx] = x_c; + value_cache[biggest_idx].second = contraction_val; + // simplex populated, check for convergence and prep for reflection + check_for_convergence(); + stage = REFLECTION; + } else { + // Shrinking + if (verbose) { + std::cout << "shrinking" << std::endl; + } + // we take the whole simplex, and move every point towards the + // current best candidate + for (unsigned int i = 0; i < dimension; i++) { + if (i != smallest_idx) { + simplex[i] = simplex[smallest_idx] + + delta * (simplex[i] - simplex[smallest_idx]); + value_cache[i].first = false; + } + } + // have to re-evaluate the new simplex, so reset. 
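+            // (get_next_point() will hand the shrunken vertices back out one
+            // at a time until all of their costs have been re-reported)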
+ current_simplex_index = 0; + stage = SIMPLEX; + } + } +}; + +}; // namespace nelder_mead +}; // namespace internal +}; // namespace apex diff --git a/src/unit_tests/Kokkos/CMakeLists.txt b/src/unit_tests/Kokkos/CMakeLists.txt index 39c86f5e..7ec3c7aa 100644 --- a/src/unit_tests/Kokkos/CMakeLists.txt +++ b/src/unit_tests/Kokkos/CMakeLists.txt @@ -10,6 +10,7 @@ link_directories (${APEX_BINARY_DIR}/src/apex) set(example_programs simple two_var + mm2d_tiling ) foreach(example_program ${example_programs}) diff --git a/src/unit_tests/Kokkos/mm2d_tiling.cpp b/src/unit_tests/Kokkos/mm2d_tiling.cpp new file mode 100644 index 00000000..42aada15 --- /dev/null +++ b/src/unit_tests/Kokkos/mm2d_tiling.cpp @@ -0,0 +1,325 @@ +#include + +#include + +#include +#include // cbrt +#include +#include +#include +#include +#include +#include +#include + +const int M=128; +const int N=128; +const int P=128; +const std::string mm2D{"mm2D"}; +enum schedulers{StaticSchedule, DynamicSchedule}; +static const std::string scheduleNames[] = {"static", "dynamic"}; + +using matrix2d = Kokkos::View; +namespace KTE = Kokkos::Tools::Experimental; +namespace KE = Kokkos::Experimental; +constexpr int lowerBound{100}; +constexpr int upperBound{999}; + +// Helper function to generate tile sizes +std::vector factorsOf(const int &size){ + std::vector factors; + for(int i=1; i makeRange(const int &size){ + std::vector range; + for(int i=2; i<=size; i+=2){ + range.push_back(i); + } + return range; +} + +// helper function for human output +void reportOptions(std::vector& candidates, + std::string name, size_t size) { + std::cout<<"Tiling options for " << name << "="< candidates = factorsOf(limit); + reportOptions(candidates, name, limit); + // create our variable object + KTE::VariableInfo out_info; + // set the variable details + out_info.type = KTE::ValueType::kokkos_value_int64; + out_info.category = KTE::StatisticalCategory::kokkos_value_categorical; + out_info.valueQuantity = KTE::CandidateValueType::kokkos_value_set; + out_info.candidates = KTE::make_candidate_set(candidates.size(),candidates.data()); + // declare the variable + out_value_id = KTE::declare_output_type(varname,out_info); + // return the id + return out_value_id; +} + +// helper function for declaring input size variables +size_t declareInputViewSize(std::string varname, int64_t size) { + size_t in_value_id; + // create a 'vector' of value(s) + std::vector candidates = {size}; + // create our variable object + KTE::VariableInfo in_info; + // set the variable details + in_info.type = KTE::ValueType::kokkos_value_int64; + in_info.category = KTE::StatisticalCategory::kokkos_value_categorical; + in_info.valueQuantity = KTE::CandidateValueType::kokkos_value_set; + in_info.candidates = KTE::make_candidate_set(candidates.size(),candidates.data()); + // declare the variable + in_value_id = KTE::declare_input_type(varname,in_info); + // return the id + return in_value_id; +} + +// helper function for declaring scheduler variable +size_t declareOutputSchedules(std::string varname) { + // create a vector of potential values + std::vector candidates_schedule = {StaticSchedule,DynamicSchedule}; + // create our variable object + KTE::VariableInfo schedule_out_info; + // set the variable details + schedule_out_info.type = KTE::ValueType::kokkos_value_int64; + schedule_out_info.category = KTE::StatisticalCategory::kokkos_value_categorical; + schedule_out_info.valueQuantity = KTE::CandidateValueType::kokkos_value_set; + schedule_out_info.candidates = 
KTE::make_candidate_set(candidates_schedule.size(),candidates_schedule.data()); + // declare the variable + size_t schedule_out_value_id = KTE::declare_output_type(varname,schedule_out_info); + // return the id + return schedule_out_value_id; +} + +// helper function for declaring output tread count variable +size_t declareOutputThreadCount(std::string varname, size_t limit) { + size_t out_value_id; + // create a vector of potential values + std::vector candidates = makeRange(limit); + // create our variable object + KTE::VariableInfo out_info; + // set the variable details + out_info.type = KTE::ValueType::kokkos_value_int64; + out_info.category = KTE::StatisticalCategory::kokkos_value_categorical; + out_info.valueQuantity = KTE::CandidateValueType::kokkos_value_set; + out_info.candidates = KTE::make_candidate_set(candidates.size(),candidates.data()); + // declare the variable + out_value_id = KTE::declare_output_type(varname,out_info); + // return the id + return out_value_id; +} + +int main(int argc, char *argv[]){ + // surely there is a way to get this from Kokkos? + bool tuning = false; + char * tmp{getenv("APEX_KOKKOS_TUNING")}; + if (tmp != nullptr) { + std::string tmpstr {tmp}; + if (tmpstr.compare("1") == 0) { + tuning = true; + } + } + // initialize Kokkos + Kokkos::initialize(argc, argv); + { + // print the Kokkos configuration + Kokkos::print_configuration(std::cout, false); + // seed the random number generator, sure + srand(time(0)); + + /* Declare/Init re,ar1,ar2 */ + matrix2d ar1("array1",M,N), ar2("array2",N,P), re("Result",M,P); + initArray(ar1, M, N); + initArray(ar2, N, P); + zeroArray(re, M, P); + + // Context variable setup - needed to generate a unique context hash for tuning. + // Declare the variables and store the variable IDs + size_t id[5]; + id[0] = 1; // default input for the region name ("mm2D") + id[1] = 2; // default input for the region type ("parallel_for") + id[2] = declareInputViewSize("matrix_size_M", M); + id[3] = declareInputViewSize("matrix_size_N", N); + id[4] = declareInputViewSize("matrix_size_P", P); + + // create an input vector of variables with name, loop type, and view sizes. 
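+    // (these inputs stay fixed for every iteration; they only give the tuning
+    // tool a stable context signature to associate its measurements with)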
+ std::vector input_vector{ + KTE::make_variable_value(id[0], mm2D), + KTE::make_variable_value(id[1], "parallel_for"), + KTE::make_variable_value(id[2], int64_t(M)), + KTE::make_variable_value(id[3], int64_t(N)), + KTE::make_variable_value(id[4], int64_t(P)) + }; + + // Declare the variables and store the variable IDs + size_t out_value_id[5]; + + // Tuning tile size - setup + out_value_id[0] = declareOutputTileSize("M", "ti_out", M); + out_value_id[1] = declareOutputTileSize("N", "tj_out", N); + out_value_id[2] = declareOutputTileSize("P", "tk_out", P); + // Tuning tile size - end setup + + // scheduling policy - setup + out_value_id[3] = declareOutputSchedules("schedule_out"); + // scheduling policy - end setup + + // thread count - setup + int64_t max_threads = std::min(std::thread::hardware_concurrency(), + (unsigned int)(Kokkos::OpenMP::concurrency())); + out_value_id[4] = declareOutputThreadCount("thread_count", max_threads); + // thread count - end setup + + //The second argument to make_varaible_value might be a default value + std::vector answer_vector{ + KTE::make_variable_value(out_value_id[0], int64_t(1)), + KTE::make_variable_value(out_value_id[1], int64_t(1)), + KTE::make_variable_value(out_value_id[2], int64_t(1)), + KTE::make_variable_value(out_value_id[3], int64_t(StaticSchedule)), + KTE::make_variable_value(out_value_id[4], int64_t(max_threads)) + }; + + /* Declare the kernel that does the work */ + const auto kernel = KOKKOS_LAMBDA(int i, int j, int k){ + re(i,j) += ar1(i,j) * ar2(j,k); + }; + + /* Iterate max_iterations times, so that we can explore the search + * space. Not all searches will converge - we have a large space! + * It's likely that exhaustive search will fail to converge. */ + for (int i = 0 ; i < Impl::max_iterations ; i++) { + // request a context id + size_t context = KTE::get_new_context_id(); + // start the context + KTE::begin_context(context); + + // set the input values for the context + KTE::set_input_values(context, input_vector.size(), input_vector.data()); + // request new output values for the context + KTE::request_output_values(context, answer_vector.size(), answer_vector.data()); + + // get the tiling factors + int ti,tj,tk; + ti = answer_vector[0].value.int_value; + tj = answer_vector[1].value.int_value; + tk = answer_vector[2].value.int_value; + // get our schedule and thread count + int scheduleType = answer_vector[3].value.int_value; + // there's probably a better way to set the thread count? + int num_threads = answer_vector[4].value.int_value; + int leftover_threads = max_threads - answer_vector[4].value.int_value; + + // no tuning? + if (!tuning) { + // Report the tuning, if desired + /* + std::cout << "Tiling: default, "; + std::cout << "Schedule: default "; + std::cout << "Threads: default "; + std::cout << std::endl; + */ + + // default scheduling policy, default tiling + Kokkos::MDRangePolicy> default_policy({0,0,0},{M,N,P}); + Kokkos::parallel_for( + mm2D, default_policy, KOKKOS_LAMBDA(int i, int j, int k){ + re(i,j) += ar1(i,j) * ar2(j,k); + } + ); + // use static schedule? 
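+      // (tuned tile sizes with a static schedule; when fewer than max_threads
+      // are requested, run on a partitioned OpenMP instance instead)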
+ } else if (scheduleType == StaticSchedule) { + // Report the tuning, if desired + /* + std::cout << "Tiling: [" << ti << "," << tj << "," << tk << "], "; + std::cout << "Schedule: " << scheduleNames[scheduleType] << ", "; + std::cout << "Threads: " << answer_vector[4].value.int_value; + std::cout << std::endl; + */ + + // if using max threads, no need to partition + if (num_threads == max_threads) { + // static scheduling, tuned tiling + Kokkos::MDRangePolicy, + Kokkos::Rank<3>> static_policy({0,0,0},{M,N,P},{ti,tj,tk}); + Kokkos::parallel_for(mm2D, static_policy, kernel); + } else { + // partition the space so we can tune the number of threads + auto instances = KE::partition_space(Kokkos::OpenMP(), + num_threads, leftover_threads); + // static scheduling, tuned tiling + Kokkos::MDRangePolicy, + Kokkos::Rank<3>> static_policy(instances[0],{0,0,0},{M,N,P},{ti,tj,tk}); + Kokkos::parallel_for(mm2D, static_policy, kernel); + } + } else { + /* + // Report the tuning, if desired + std::cout << "Tiling: [" << ti << "," << tj << "," << tk << "], "; + std::cout << "Schedule: " << scheduleNames[scheduleType] << ", "; + std::cout << "Threads: " << answer_vector[4].value.int_value; + std::cout << std::endl; + */ + + // if using max threads, no need to partition + if (num_threads == max_threads) { + // dynamic scheduling, tuned tiling + Kokkos::MDRangePolicy, + Kokkos::Rank<3>> dynamic_policy({0,0,0},{M,N,P},{ti,tj,tk}); + Kokkos::parallel_for( + mm2D, dynamic_policy, kernel); + } else { + // partition the space so we can tune the number of threads + auto instances = KE::partition_space(Kokkos::OpenMP(), + num_threads, leftover_threads); + // dynamic scheduling, tuned tiling + Kokkos::MDRangePolicy, + Kokkos::Rank<3>> dynamic_policy(instances[0],{0,0,0},{M,N,P},{ti,tj,tk}); + Kokkos::parallel_for( + mm2D, dynamic_policy, kernel); + } + } + // end the context + KTE::end_context(context); + } + } + Kokkos::finalize(); +} \ No newline at end of file diff --git a/src/unit_tests/Kokkos/tuning_playground.hpp b/src/unit_tests/Kokkos/tuning_playground.hpp index ad496d1d..94a724e4 100644 --- a/src/unit_tests/Kokkos/tuning_playground.hpp +++ b/src/unit_tests/Kokkos/tuning_playground.hpp @@ -3,9 +3,12 @@ #include #include +#include namespace Impl { +constexpr const int max_iterations{1000}; + struct empty {}; template typename TupleLike, @@ -40,12 +43,10 @@ auto setup_helper(const Setup &setup, int num_iters, std::true_type) { template void tuned_kernel(int argc, char* argv[], Setup setup, Tunable tunable){ - int num_iters = 100000; - bool tuned_internals; - bool found_tuning_tool; - bool print_progress; + int num_iters = 1000; Kokkos::initialize(argc, argv); { + Kokkos::print_configuration(std::cout, false); using emptiness = typename std::is_same::type; auto kernel_data = Impl::setup_helper(setup, num_iters, emptiness{}); @@ -74,7 +75,7 @@ size_t create_categorical_int_tuner(std::string name, size_t num_options){ info.type = ValueType::kokkos_value_int64; info.valueQuantity = CandidateValueType::kokkos_value_set; std::vector options; - for(int x=0;x -void fastest_of(const std::string& label, Implementations... implementations){ +void fastest_of(const std::string& label, const size_t count, Implementations... implementations){ using namespace Kokkos::Tools::Experimental; auto tuner_iter = [&]() { auto my_tuner = ids_for_kernels.find(label); @@ -111,11 +112,19 @@ void fastest_of(const std::string& label, Implementations... 
implementations){
   auto input_id = create_fastest_implementation_id();
   VariableValue picked_implementation = make_variable_value(var_id,int64_t(0));
   VariableValue which_kernel = make_variable_value(var_id,label.c_str());
+  which_kernel.value.int_value = -1;
   auto context_id = get_new_context_id();
   begin_context(context_id);
-  set_input_values(context_id, 1, &which_kernel);
-  request_output_values(context_id, 1, &picked_implementation);
-  fastest_of_helper(picked_implementation.value.int_value, implementations...);
+  set_input_values(context_id, 1, &picked_implementation);
+  request_output_values(context_id, 1, &which_kernel);
+  // if we didn't get a prediction, just alternate between methods.
+  if (which_kernel.value.int_value < 0) {
+    static int flipper{0};
+    fastest_of_helper(flipper, implementations...);
+    flipper = (flipper + 1) % count;
+  } else {
+    fastest_of_helper(which_kernel.value.int_value, implementations...);
+  }
   end_context(context_id);
 }
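Below is a minimal, illustrative sketch (not part of the patch) of the get_next_point()/report() handshake that the new NelderMead wrapper drives internally through getNewSettings() and evaluate(). It assumes the Searcher template interface declared in src/apex/nelder_mead_internal.h; the toy cost function and the main() wrapper are invented for the example.

// Hedged example: drive apex::internal::nelder_mead::Searcher<double> directly
// on a toy 2-D quadratic with box limits [0,1] x [0,1]. Illustrative only.
#include "nelder_mead_internal.h"
#include <iostream>
#include <vector>

int main() {
    using Searcher = apex::internal::nelder_mead::Searcher<double>;
    std::vector<double> start{0.9, 0.9};                  // initial guess
    Searcher search(start, {}, {0.0, 0.0}, {1.0, 1.0}, true);
    search.point_tolerance(1e-3);
    while (!search.converged()) {
        // candidate point: a simplex vertex, reflection, expansion, or contraction
        const std::vector<double> &x = search.get_next_point();
        // toy cost: squared distance from the (made-up) optimum (0.25, 0.75)
        double cost = (x[0] - 0.25) * (x[0] - 0.25) +
                      (x[1] - 0.75) * (x[1] - 0.75);
        search.report(cost);                              // feed the measurement back
    }
    std::cout << "best point: [" << search.get_res()[0] << ","
              << search.get_res()[1] << "]" << std::endl;
    return 0;
}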