From fa1a7aceba34c8b974c9201b791dbfa1164e3402 Mon Sep 17 00:00:00 2001 From: mloubout Date: Tue, 19 Sep 2023 09:51:08 -0400 Subject: [PATCH] compiler: improve interpolation parallelism --- devito/operations/interpolators.py | 5 +++++ tests/test_dle.py | 11 ++--------- tests/test_dse.py | 29 ++++++++++++++--------------- tests/test_gpu_openacc.py | 18 ++++++++++-------- tests/test_interpolation.py | 3 ++- tests/test_operator.py | 4 ++-- 6 files changed, 35 insertions(+), 35 deletions(-) diff --git a/devito/operations/interpolators.py b/devito/operations/interpolators.py index dae96d8dfe5..3d2dcb74660 100644 --- a/devito/operations/interpolators.py +++ b/devito/operations/interpolators.py @@ -305,6 +305,7 @@ def _inject(self, field, expr, implicit_dims=None): # Make iterable to support inject((u, v), expr=expr) # or inject((u, v), expr=(expr1, expr2)) fields, exprs = as_tuple(field), as_tuple(expr) + # Provide either one expr per field or on expr for all fields if len(fields) > 1: if len(exprs) == 1: @@ -323,6 +324,10 @@ def _inject(self, field, expr, implicit_dims=None): # Implicit dimensions implicit_dims = self._augment_implicit_dims(implicit_dims, variables) + # Move all temporaries inside inner loop to improve parallelism + # Can only be done for inject as interpolation needs a temporary + # summing temp that wouldn't allow collapsing + implicit_dims = implicit_dims + tuple(r.parent for r in self._rdim) variables = variables + list(fields) diff --git a/tests/test_dle.py b/tests/test_dle.py index df3c4adfa58..8d58827a610 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -187,19 +187,12 @@ def test_cache_blocking_structure_optrelax(): op = Operator(eqns, opt=('advanced', {'blockrelax': True})) - bns, _ = assert_blocking(op, {'x0_blk0', 'p_src0_blk0', 'p_src1_blk0'}) + bns, _ = assert_blocking(op, {'x0_blk0', 'p_src0_blk0'}) iters = FindNodes(Iteration).visit(bns['p_src0_blk0']) - assert len(iters) == 2 - assert iters[0].dim.is_Block - assert 
iters[1].dim.is_Block - - iters = FindNodes(Iteration).visit(bns['p_src1_blk0']) assert len(iters) == 5 assert iters[0].dim.is_Block assert iters[1].dim.is_Block - for i in range(2, 5): - assert not iters[i].dim.is_Block def test_cache_blocking_structure_optrelax_customdim(): @@ -965,7 +958,7 @@ def test_parallel_prec_inject(self): iterations = FindNodes(Iteration).visit(op0) assert not iterations[0].pragmas - assert 'omp for collapse' in iterations[2].pragmas[0].value + assert 'omp for collapse' in iterations[1].pragmas[0].value class TestNestedParallelism(object): diff --git a/tests/test_dse.py b/tests/test_dse.py index 1e18157c77e..728f8f93578 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -48,9 +48,9 @@ def test_scheduling_after_rewrite(): trees = retrieve_iteration_tree(op) # Check loop nest structure - assert all(i.dim is j for i, j in zip(trees[1], grid.dimensions)) # time invariant - assert trees[2].root.dim is grid.time_dim - assert all(trees[2].root.dim is tree.root.dim for tree in trees[2:]) + assert all(i.dim is j for i, j in zip(trees[0], grid.dimensions)) # time invariant + assert trees[1].root.dim is grid.time_dim + assert all(trees[1].root.dim is tree.root.dim for tree in trees[1:]) @pytest.mark.parametrize('exprs,expected,min_cost', [ @@ -1687,7 +1687,7 @@ def test_drop_redundants_after_fusion(self, rotate): op = Operator(eqns, opt=('advanced', {'cire-rotate': rotate})) arrays = [i for i in FindSymbols().visit(op) if i.is_Array] - assert len(arrays) == 4 + assert len(arrays) == 2 assert all(i._mem_heap and not i._mem_external for i in arrays) def test_full_shape_big_temporaries(self): @@ -2711,11 +2711,10 @@ def test_fullopt(self): assert np.isclose(summary0[('section0', None)].oi, 2.851, atol=0.001) assert summary1[('section0', None)].ops == 9 - assert summary1[('section1', None)].ops == 9 - assert summary1[('section2', None)].ops == 31 - assert summary1[('section3', None)].ops == 26 - assert summary1[('section4', None)].ops == 22 - 
assert np.isclose(summary1[('section2', None)].oi, 1.767, atol=0.001) + assert summary1[('section1', None)].ops == 31 + assert summary1[('section2', None)].ops == 88 + assert summary1[('section3', None)].ops == 22 + assert np.isclose(summary1[('section1', None)].oi, 1.767, atol=0.001) assert np.allclose(u0.data, u1.data, atol=10e-5) assert np.allclose(rec0.data, rec1.data, atol=10e-5) @@ -2775,8 +2774,8 @@ def test_fullopt(self): assert np.allclose(self.tti_noopt[1].data, rec.data, atol=10e-1) # Check expected opcount/oi - assert summary[('section3', None)].ops == 92 - assert np.isclose(summary[('section3', None)].oi, 2.074, atol=0.001) + assert summary[('section2', None)].ops == 92 + assert np.isclose(summary[('section2', None)].oi, 2.074, atol=0.001) # With optimizations enabled, there should be exactly four BlockDimensions op = wavesolver.op_fwd() @@ -2794,7 +2793,7 @@ def test_fullopt(self): # 3 Arrays are defined globally for the sparse positions temporaries # and two additional bock-sized Arrays are defined locally arrays = [i for i in FindSymbols().visit(op) if i.is_Array] - extra_arrays = 2+3+3 + extra_arrays = 2+3 assert len(arrays) == 4 + extra_arrays assert all(i._mem_heap and not i._mem_external for i in arrays) bns, pbs = assert_blocking(op, {'x0_blk0'}) @@ -2830,7 +2829,7 @@ def test_fullopt_w_mpi(self): def test_opcounts(self, space_order, expected): op = self.tti_operator(opt='advanced', space_order=space_order) sections = list(op.op_fwd()._profiler._sections.values()) - assert sections[3].sops == expected + assert sections[2].sops == expected @switchconfig(profiling='advanced') @pytest.mark.parametrize('space_order,expected', [ @@ -2840,8 +2839,8 @@ def test_opcounts_adjoint(self, space_order, expected): wavesolver = self.tti_operator(opt=('advanced', {'openmp': False})) op = wavesolver.op_adj() - assert op._profiler._sections['section3'].sops == expected - assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3+3 + assert 
op._profiler._sections['section2'].sops == expected + assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3 class TestTTIv2(object): diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index db92db3c83f..3085ad85c99 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -102,15 +102,15 @@ def test_tile_insteadof_collapse(self, par_tile): opt=('advanced', {'par-tile': par_tile})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 4 - assert trees[1][1].pragmas[0].value ==\ + assert trees[0][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4,4) present(u)' - assert trees[2][1].pragmas[0].value ==\ + assert trees[1][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4) present(u)' # Only the AFFINE Iterations are tiled - assert trees[4][1].pragmas[0].value ==\ - 'acc parallel loop present(src,src_coords,u) deviceptr(r1,r2,r3)' + assert trees[3][1].pragmas[0].value ==\ + 'acc parallel loop collapse(4) present(src,src_coords,u)' @pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)), ((32, 4, 4), (8, 8, 8))]) @@ -130,12 +130,14 @@ def test_multiple_tile_sizes(self, par_tile): opt=('advanced', {'par-tile': par_tile})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 4 - assert trees[1][1].pragmas[0].value ==\ + assert trees[0][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4,4) present(u)' - assert trees[2][1].pragmas[0].value ==\ + assert trees[1][1].pragmas[0].value ==\ 'acc parallel loop tile(8,8) present(u)' + assert trees[3][1].pragmas[0].value ==\ + 'acc parallel loop collapse(4) present(src,src_coords,u)' def test_multi_tile_blocking_structure(self): grid = Grid(shape=(8, 8, 8)) diff --git a/tests/test_interpolation.py b/tests/test_interpolation.py index c7a15665a48..97d86c1759f 100644 --- a/tests/test_interpolation.py +++ b/tests/test_interpolation.py @@ -5,7 +5,7 @@ from sympy import Float from devito import (Grid, 
Operator, Dimension, SparseFunction, SparseTimeFunction, - Function, TimeFunction, DefaultDimension, Eq, + Function, TimeFunction, DefaultDimension, Eq, switchconfig, PrecomputedSparseFunction, PrecomputedSparseTimeFunction, MatrixSparseTimeFunction) from examples.seismic import (demo_model, TimeAxis, RickerSource, Receiver, @@ -736,6 +736,7 @@ class SparseFirst(SparseFunction): assert np.allclose(s.data, expected) +@switchconfig(safe_math=True) def test_inject_function(): nt = 11 diff --git a/tests/test_operator.py b/tests/test_operator.py index 4f8228bc24a..3064565e3c9 100644 --- a/tests/test_operator.py +++ b/tests/test_operator.py @@ -1805,7 +1805,7 @@ def test_scheduling_sparse_functions(self): # `trees` than 6 op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 5 # Time loop not shared due to the WAR assert trees[0][0].dim is time and trees[0][0] is trees[1][0] # this IS shared assert trees[1][0] is not trees[3][0] @@ -1815,7 +1815,7 @@ def test_scheduling_sparse_functions(self): eqn2 = sf1.inject(u1.forward, expr=sf1) op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 5 assert all(trees[0][0] is i[0] for i in trees) def test_scheduling_with_free_dims(self):