Skip to content

Commit 801766e

Browse files
acreyes, Cloud User, lroberts36
authored
Enable parthenon::par_reduce for MD loops with Kokkos 1D Range (#1130)
* wrap 3D flat loop abstractions
* add 4D loop and test
* add specialization for const int &
* added mdrange loops to par_reduce tests
* refactor flatloop specialization
* clean up
* formatting
* linting
* templating functor index types
* moved to a single functor
* Update CHANGELOG.md
* remove unused Nj

---------

Co-authored-by: Cloud User <[email protected]>
Co-authored-by: Luke Roberts <[email protected]>
1 parent 2e0e981 commit 801766e

File tree

3 files changed

+137
-30
lines changed

3 files changed

+137
-30
lines changed

CHANGELOG.md

+1
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
## Current develop
44

55
### Added (new features/APIs/variables/...)
6+
- [[PR 1130]](https://github.com/parthenon-hpc-lab/parthenon/pull/1130) Enable `parthenon::par_reduce` for MD loops with Kokkos 1D Range
67
- [[PR 1119]](https://github.com/parthenon-hpc-lab/parthenon/pull/1119) Formalize MeshData partitioning.
78
- [[PR 1128]](https://github.com/parthenon-hpc-lab/parthenon/pull/1128) Add cycle and nbtotal to hst
89
- [[PR 1099]](https://github.com/parthenon-hpc-lab/parthenon/pull/1099) Functionality for outputting task graphs in GraphViz format.

src/kokkos_abstraction.hpp

+59-26
Original file line number | Diff line number | Diff line change
@@ -258,6 +258,35 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac
258258
function, std::forward<Args>(args)...);
259259
}
260260

261+
// Adapter that lets a multi-dimensional loop body be driven by a flattened 1D
// index. The primary template is only declared; specializations are selected
// on the type of the wrapped functor's call operator.
template <typename, typename>
class FlatFunctor;

// Builds the FlatFunctor specialization matching F::operator().
//   function - loop body to wrap; it is copied into the returned functor.
//   args     - forwarded to the specialization's constructor after `function`
//              (strides first, then lower bounds).
// F must have exactly one non-template operator(), otherwise
// decltype(&F::operator()) is ill-formed.
template <typename F, typename... Args>
auto MakeFlatFunctor(F &function, Args... args) {
  // `args` are by-value parameters rather than forwarding references, so they
  // are passed on directly instead of through std::forward.
  return FlatFunctor<F, decltype(&F::operator())>(function, args...);
}

// Specialization for loop bodies callable as function(k, j, i, extra...).
// Any trailing parameters of the wrapped call operator (FArgs) are accepted by
// this functor's operator() and handed through unchanged.
template <typename Function, typename R, typename T, typename Index, typename... FArgs>
class FlatFunctor<Function, R (T::*)(Index, Index, Index, FArgs...) const> {
  // Members are declared in the same order in which the constructor
  // initializes them.
  Function function;
  int NjNi, Ni, kl, jl, il;

 public:
  // _NjNi and _Ni are the strides used to split the flat index;
  // _kl, _jl, _il are the offsets added to the recovered k, j, i.
  FlatFunctor(const Function _function, const int _NjNi, const int _Ni, const int _kl,
              const int _jl, const int _il)
      : function(_function), NjNi(_NjNi), Ni(_Ni), kl(_kl), jl(_jl), il(_il) {}

  // Splits `idx` into (k, j, i), shifts each by its lower bound and invokes
  // the wrapped body.
  KOKKOS_INLINE_FUNCTION
  void operator()(const int &idx, FArgs &&...fargs) const {
    // k, j, i stay non-const lvalues so they bind to whatever Index type the
    // wrapped call operator declares.
    int k = idx / NjNi;
    const int in_plane = idx - k * NjNi;  // offset within the k-th (j, i) plane
    int j = in_plane / Ni;
    int i = in_plane - j * Ni;
    k += kl;
    j += jl;
    i += il;
    function(k, j, i, std::forward<FArgs>(fargs)...);
  }
};
289+
261290
// 3D loop using Kokkos 1D Range
262291
template <typename Tag, typename Function, class... Args>
263292
inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
@@ -270,18 +299,9 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp
270299
const int Ni = iu - il + 1;
271300
const int NkNjNi = Nk * Nj * Ni;
272301
const int NjNi = Nj * Ni;
273-
kokkos_dispatch(
274-
tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi),
275-
KOKKOS_LAMBDA(const int &idx) {
276-
int k = idx / NjNi;
277-
int j = (idx - k * NjNi) / Ni;
278-
int i = idx - k * NjNi - j * Ni;
279-
k += kl;
280-
j += jl;
281-
i += il;
282-
function(k, j, i);
283-
},
284-
std::forward<Args>(args)...);
302+
kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi),
303+
MakeFlatFunctor(function, NjNi, Ni, kl, jl, il),
304+
std::forward<Args>(args)...);
285305
}
286306

287307
// 3D loop using MDRange loops
@@ -372,6 +392,30 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
372392
function(k, j, i);
373393
}
374394

395+
template <typename Function, typename R, typename T, typename Index, typename... FArgs>
396+
class FlatFunctor<Function, R (T::*)(Index, Index, Index, Index, FArgs...) const> {
397+
int NkNjNi, NjNi, Ni, nl, kl, jl, il;
398+
Function function;
399+
400+
public:
401+
FlatFunctor(const Function _function, const int _NkNjNi, const int _NjNi, const int _Ni,
402+
const int _nl, const int _kl, const int _jl, const int _il)
403+
: function(_function), NkNjNi(_NkNjNi), NjNi(_NjNi), Ni(_Ni), nl(_nl), kl(_kl),
404+
jl(_jl), il(_il) {}
405+
KOKKOS_INLINE_FUNCTION
406+
void operator()(const int &idx, FArgs &&...fargs) const {
407+
int n = idx / NkNjNi;
408+
int k = (idx - n * NkNjNi) / NjNi;
409+
int j = (idx - n * NkNjNi - k * NjNi) / Ni;
410+
int i = idx - n * NkNjNi - k * NjNi - j * Ni;
411+
n += nl;
412+
k += kl;
413+
j += jl;
414+
i += il;
415+
function(n, k, j, i, std::forward<FArgs>(fargs)...);
416+
}
417+
};
418+
375419
// 4D loop using Kokkos 1D Range
376420
template <typename Tag, typename Function, class... Args>
377421
inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
@@ -387,20 +431,9 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp
387431
const int NnNkNjNi = Nn * Nk * Nj * Ni;
388432
const int NkNjNi = Nk * Nj * Ni;
389433
const int NjNi = Nj * Ni;
390-
kokkos_dispatch(
391-
tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi),
392-
KOKKOS_LAMBDA(const int &idx) {
393-
int n = idx / NkNjNi;
394-
int k = (idx - n * NkNjNi) / NjNi;
395-
int j = (idx - n * NkNjNi - k * NjNi) / Ni;
396-
int i = idx - n * NkNjNi - k * NjNi - j * Ni;
397-
n += nl;
398-
k += kl;
399-
j += jl;
400-
i += il;
401-
function(n, k, j, i);
402-
},
403-
std::forward<Args>(args)...);
434+
kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi),
435+
MakeFlatFunctor(function, NkNjNi, NjNi, Ni, nl, kl, jl, il),
436+
std::forward<Args>(args)...);
404437
}
405438

406439
// 4D loop using MDRange loops

tst/unit/kokkos_abstraction.cpp

+77-4
Original file line number | Diff line number | Diff line change
@@ -500,12 +500,85 @@ bool test_wrapper_reduce_1d(T loop_pattern, DevExecSpace exec_space) {
500500
return total == test_tot;
501501
}
502502

503+
template <class T>
504+
bool test_wrapper_reduce_3d(T loop_pattern, DevExecSpace exec_space) {
505+
constexpr int N = 10;
506+
parthenon::ParArray3D<int> buffer("Testing buffer", N, N, N);
507+
// Initialize data
508+
parthenon::par_for(
509+
loop_pattern, "Initialize parallel reduce array", exec_space, 0, N - 1, 0, N - 1, 0,
510+
N - 1, KOKKOS_LAMBDA(const int k, const int j, const int i) {
511+
buffer(k, j, i) = i + j + k;
512+
});
513+
int tot = 0;
514+
for (int k = 0; k < N; ++k) {
515+
for (int j = 0; j < N; ++j) {
516+
for (int i = 0; i < N; ++i) {
517+
tot += i + j + k;
518+
}
519+
}
520+
}
521+
int test_tot = 0;
522+
parthenon::par_reduce(
523+
loop_pattern, "Sum via par reduce", exec_space, 0, N - 1, 0, N - 1, 0, N - 1,
524+
KOKKOS_LAMBDA(const int k, const int j, const int i, int &t) { t += i + j + k; },
525+
Kokkos::Sum<int>(test_tot));
526+
return tot == test_tot;
527+
}
528+
529+
template <class T>
530+
bool test_wrapper_reduce_4d(T loop_pattern, DevExecSpace exec_space) {
531+
constexpr int N = 10;
532+
parthenon::ParArray4D<int> buffer("Testing buffer", N, N, N, N);
533+
// Initialize data
534+
parthenon::par_for(
535+
loop_pattern, "Initialize parallel reduce array", exec_space, 0, N - 1, 0, N - 1, 0,
536+
N - 1, 0, N - 1, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
537+
buffer(n, k, j, i) = i + j + k + n;
538+
});
539+
int tot = 0;
540+
for (int n = 0; n < N; ++n) {
541+
for (int k = 0; k < N; ++k) {
542+
for (int j = 0; j < N; ++j) {
543+
for (int i = 0; i < N; ++i) {
544+
tot += i + j + k + n;
545+
}
546+
}
547+
}
548+
}
549+
int test_tot = 0;
550+
parthenon::par_reduce(
551+
loop_pattern, "Sum via par reduce", exec_space, 0, N - 1, 0, N - 1, 0, N - 1, 0,
552+
N - 1,
553+
KOKKOS_LAMBDA(const int n, const int k, const int j, const int i, int &t) {
554+
t += i + j + k + n;
555+
},
556+
Kokkos::Sum<int>(test_tot));
557+
return tot == test_tot;
558+
}
559+
503560
TEST_CASE("Parallel reduce", "[par_reduce]") {
504561
auto default_exec_space = DevExecSpace();
505-
REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_flatrange_tag,
506-
default_exec_space) == true);
507-
if constexpr (std::is_same<DevExecSpace, Kokkos::Serial>::value) {
508-
REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_simdfor_tag,
562+
SECTION("1D loops") {
563+
REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_flatrange_tag,
564+
default_exec_space) == true);
565+
if constexpr (std::is_same<DevExecSpace, Kokkos::Serial>::value) {
566+
REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_simdfor_tag,
567+
default_exec_space) == true);
568+
}
569+
}
570+
571+
SECTION("3D loops") {
572+
REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_flatrange_tag,
573+
default_exec_space) == true);
574+
REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_mdrange_tag,
575+
default_exec_space) == true);
576+
}
577+
578+
SECTION("4D loops") {
579+
REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_flatrange_tag,
580+
default_exec_space) == true);
581+
REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_mdrange_tag,
509582
default_exec_space) == true);
510583
}
511584
}

0 commit comments

Comments
 (0)