From 227730a1259d68ac494fd754b575a588a04e97b1 Mon Sep 17 00:00:00 2001 From: Mathias Louboutin Date: Mon, 18 Sep 2023 11:56:45 -0400 Subject: [PATCH] compiler: prevent halo to be moved outside their iteration space --- devito/ir/stree/algorithms.py | 6 ++++++ devito/mpi/halo_scheme.py | 4 ++++ devito/passes/iet/langbase.py | 16 ++++++++++------ devito/passes/iet/parpragma.py | 10 ++++++++-- tests/test_dle.py | 10 ++++++---- tests/test_gpu_common.py | 6 +++--- tests/test_gpu_openacc.py | 16 ++++++++-------- tests/test_gpu_openmp.py | 2 +- tests/test_mpi.py | 3 ++- 9 files changed, 48 insertions(+), 25 deletions(-) diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 58e8e844e69..36505e9e617 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -147,6 +147,12 @@ def preprocess(clusters, options=None, **kwargs): found = [] for c1 in list(queue): distributed_aindices = c1.halo_scheme.distributed_aindices + h_indices = set().union(*[(d, d.root) + for d in c1.halo_scheme.loc_indices]) + + # Skip if the halo exchange would end up outside its needed iteration space + if not h_indices & dims: + continue diff = dims - distributed_aindices intersection = dims & distributed_aindices diff --git a/devito/mpi/halo_scheme.py b/devito/mpi/halo_scheme.py index 0204c171e67..970e84633dc 100644 --- a/devito/mpi/halo_scheme.py +++ b/devito/mpi/halo_scheme.py @@ -361,6 +361,10 @@ def distributed(self): def distributed_aindices(self): return set().union(*[i.dims for i in self.fmapper.values()]) + @cached_property + def loc_indices(self): + return set().union(*[i.loc_indices.keys() for i in self.fmapper.values()]) + @cached_property def arguments(self): return self.dimensions | set(flatten(self.honored.values())) diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index 4a4f6ac4653..77c4ad1a3d7 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -214,8 +214,8 @@ def 
DeviceIteration(self): def Prodder(self): return self.lang.Prodder - def _is_offloadable(self, *args, **kwargs): - return False + def _n_device_pointers(self, *args, **kwargs): + return 0 class DeviceAwareMixin(object): @@ -328,6 +328,13 @@ def _(iet): return _initialize(iet) + def _n_device_pointers(self, iet): + functions = FindSymbols().visit(iet) + buffers = [f for f in functions if f.is_Array and f._mem_mapped] + hostfuncs = [f for f in functions if not is_on_device(f, self.gpu_fit)] + + return len(functions) - len(buffers + hostfuncs) + def _is_offloadable(self, iet): """ True if the IET computation is offloadable to device, False otherwise. @@ -336,10 +343,7 @@ def _is_offloadable(self, iet): if any(not is_on_device(e.write, self.gpu_fit) for e in expressions): return False - functions = FindSymbols().visit(iet) - buffers = [f for f in functions if f.is_Array and f._mem_mapped] - hostfuncs = [f for f in functions if not is_on_device(f, self.gpu_fit)] - return not (buffers and hostfuncs) + return self._n_device_pointers(iet) > 0 class Sections(tuple): diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 9d69e12df7f..496cd80f744 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -295,13 +295,13 @@ def _select_candidates(self, candidates): except TypeError: pass - collapsable.append(i) + collapsable.append(i) # Give a score to this candidate, based on the number of fully-parallel # Iterations and their position (i.e. 
outermost to innermost) in the nest score = ( int(root.is_ParallelNoAtomic), - -int(self._is_offloadable(root))*(n0 + 1), # Outermost offloadable + -self._n_device_pointers(root), # Outermost offloadable int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1), int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1), -(n0 + 1) # The outermost, the better @@ -375,6 +375,12 @@ def _make_partree(self, candidates, nthreads=None): ncollapsed=ncollapsed, nthreads=nthreads, **root.args) prefix = [] + elif all(i.is_ParallelRelaxed for i in candidates) and nthreads is not None: + body = self.HostIteration(schedule='static', + parallel=nthreads is not self.nthreads_nested, + ncollapsed=ncollapsed, nthreads=nthreads, + **root.args) + prefix = [] else: # pragma ... for ... schedule(..., expr) assert nthreads is None diff --git a/tests/test_dle.py b/tests/test_dle.py index 3b9883e6652..df3c4adfa58 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -291,7 +291,7 @@ def test_cache_blocking_structure_optrelax_prec_inject(): 'openmp': True, 'par-collapse-ncores': 1})) - assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'], + assert_structure(op, ['t,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'], 't,p_s0_blk0,p_s,rsx,rsy') @@ -821,12 +821,13 @@ def test_incs_no_atomic(self): 'par-collapse-ncores': 1, 'par-collapse-work': 0})) - assert 'collapse(2)' in str(op0) + assert 'collapse(3)' in str(op0) assert 'atomic' in str(op0) # Now only `x` is parallelized op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)], opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1})) + assert 'omp for' in str(op1) assert 'collapse' not in str(op1) assert 'atomic' not in str(op1) @@ -951,11 +952,12 @@ def test_parallel_prec_inject(self): eqns = sf.inject(field=u.forward, expr=sf * dt**2) op0 = Operator(eqns, opt=('advanced', {'openmp': True, - 'par-collapse-ncores': 1})) + 'par-collapse-ncores': 20})) iterations = FindNodes(Iteration).visit(op0) 
assert not iterations[0].pragmas assert 'omp for' in iterations[1].pragmas[0].value + assert 'collapse' not in iterations[1].pragmas[0].value op0 = Operator(eqns, opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1, @@ -963,7 +965,7 @@ def test_parallel_prec_inject(self): iterations = FindNodes(Iteration).visit(op0) assert not iterations[0].pragmas - assert 'omp for' in iterations[1].pragmas[0].value + assert 'omp for collapse' in iterations[2].pragmas[0].value class TestNestedParallelism(object): diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 031bd9181ba..628c04f099a 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -97,9 +97,9 @@ def test_fission(self): assert trees[0].root is trees[1].root assert trees[0][1] is not trees[1][1] assert trees[0].root.dim is time - assert not trees[0].root.pragmas - assert trees[0][1].pragmas - assert trees[1][1].pragmas + assert trees[0].root.pragmas + assert not trees[0][1].pragmas + assert not trees[0][2].pragmas op.apply() diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index 823d11854de..db92db3c83f 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -102,15 +102,15 @@ def test_tile_insteadof_collapse(self, par_tile): opt=('advanced', {'par-tile': par_tile})) trees = retrieve_iteration_tree(op) - assert len(trees) == 4 + assert len(trees) == 6 - assert trees[0][1].pragmas[0].value ==\ - 'acc parallel loop tile(32,4,4) present(u)' assert trees[1][1].pragmas[0].value ==\ + 'acc parallel loop tile(32,4,4) present(u)' + assert trees[2][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4) present(u)' # Only the AFFINE Iterations are tiled - assert trees[3][1].pragmas[0].value ==\ - 'acc parallel loop collapse(3) present(src,src_coords,u)' + assert trees[4][1].pragmas[0].value ==\ + 'acc parallel loop present(src,src_coords,u) deviceptr(r1,r2,r3)' @pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)), ((32, 4, 4), 
(8, 8, 8))]) @@ -130,11 +130,11 @@ def test_multiple_tile_sizes(self, par_tile): opt=('advanced', {'par-tile': par_tile})) trees = retrieve_iteration_tree(op) - assert len(trees) == 4 + assert len(trees) == 6 - assert trees[0][1].pragmas[0].value ==\ - 'acc parallel loop tile(32,4,4) present(u)' assert trees[1][1].pragmas[0].value ==\ + 'acc parallel loop tile(32,4,4) present(u)' + assert trees[2][1].pragmas[0].value ==\ 'acc parallel loop tile(8,8) present(u)' def test_multi_tile_blocking_structure(self): diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index bc2de717082..29866508d85 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -265,7 +265,7 @@ def test_timeparallel_reduction(self): assert not tree.root.pragmas assert len(tree[1].pragmas) == 1 assert tree[1].pragmas[0].value ==\ - ('omp target teams distribute parallel for collapse(2)' + ('omp target teams distribute parallel for collapse(3)' ' reduction(+:f[0])') diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 2860fc726e4..51facd7a7c6 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -2558,7 +2558,8 @@ def test_adjoint_F_no_omp(self): # TestDecomposition().test_reshape_left_right() # TestOperatorSimple().test_trivial_eq_2d() # TestFunction().test_halo_exchange_bilateral() - TestSparseFunction().test_sparse_coords() + # TestSparseFunction().test_sparse_coords() # TestSparseFunction().test_precomputed_sparse(2) # TestOperatorAdvanced().test_fission_due_to_antidep() + TestOperatorAdvanced().test_injection_wodup_wtime() # TestIsotropicAcoustic().test_adjoint_F(1)