Skip to content

Commit f7ab007

Browse files
committed
compiler: prevent halos from being moved outside their iteration space
1 parent 87d8d0e commit f7ab007

File tree

8 files changed

+45
-19
lines changed

8 files changed

+45
-19
lines changed

devito/ir/stree/algorithms.py

+6
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,12 @@ def preprocess(clusters, options=None, **kwargs):
147147
found = []
148148
for c1 in list(queue):
149149
distributed_aindices = c1.halo_scheme.distributed_aindices
150+
h_indices = set().union(*[(d, d.root)
151+
for d in c1.halo_scheme.loc_indices])
152+
153+
# Skip if the halo exchange would end up outside its needed iteration space
154+
if h_indices and not h_indices & dims:
155+
continue
150156

151157
diff = dims - distributed_aindices
152158
intersection = dims & distributed_aindices

devito/mpi/halo_scheme.py

+4
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,10 @@ def distributed(self):
361361
def distributed_aindices(self):
362362
return set().union(*[i.dims for i in self.fmapper.values()])
363363

364+
@cached_property
365+
def loc_indices(self):
366+
return set().union(*[i.loc_indices.keys() for i in self.fmapper.values()])
367+
364368
@cached_property
365369
def arguments(self):
366370
return self.dimensions | set(flatten(self.honored.values()))

devito/passes/iet/langbase.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -214,8 +214,8 @@ def DeviceIteration(self):
214214
def Prodder(self):
215215
return self.lang.Prodder
216216

217-
def _is_offloadable(self, *args, **kwargs):
218-
return False
217+
def _n_device_pointers(self, *args, **kwargs):
218+
return 0
219219

220220

221221
class DeviceAwareMixin(object):
@@ -328,6 +328,12 @@ def _(iet):
328328

329329
return _initialize(iet)
330330

331+
def _n_device_pointers(self, iet):
332+
functions = FindSymbols().visit(iet)
333+
devfuncs = [f for f in functions if f.is_Array and f._mem_local]
334+
335+
return len(devfuncs)
336+
331337
def _is_offloadable(self, iet):
332338
"""
333339
True if the IET computation is offloadable to device, False otherwise.
@@ -339,7 +345,8 @@ def _is_offloadable(self, iet):
339345
functions = FindSymbols().visit(iet)
340346
buffers = [f for f in functions if f.is_Array and f._mem_mapped]
341347
hostfuncs = [f for f in functions if not is_on_device(f, self.gpu_fit)]
342-
return not (buffers and hostfuncs)
348+
349+
return not (hostfuncs and buffers)
343350

344351

345352
class Sections(tuple):

devito/passes/iet/parpragma.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -295,13 +295,13 @@ def _select_candidates(self, candidates):
295295
except TypeError:
296296
pass
297297

298-
collapsable.append(i)
298+
collapsable.append(i)
299299

300300
# Give a score to this candidate, based on the number of fully-parallel
301301
# Iterations and their position (i.e. outermost to innermost) in the nest
302302
score = (
303303
int(root.is_ParallelNoAtomic),
304-
-int(self._is_offloadable(root))*(n0 + 1), # Outermost offloadable
304+
self._n_device_pointers(root), # Outermost offloadable
305305
int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1),
306306
int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1),
307307
-(n0 + 1) # The outermost, the better
@@ -375,6 +375,12 @@ def _make_partree(self, candidates, nthreads=None):
375375
ncollapsed=ncollapsed, nthreads=nthreads,
376376
**root.args)
377377
prefix = []
378+
elif all(i.is_ParallelRelaxed for i in candidates) and nthreads is not None:
379+
body = self.HostIteration(schedule='static',
380+
parallel=nthreads is not self.nthreads_nested,
381+
ncollapsed=ncollapsed, nthreads=nthreads,
382+
**root.args)
383+
prefix = []
378384
else:
379385
# pragma ... for ... schedule(..., expr)
380386
assert nthreads is None

tests/test_dle.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ def test_cache_blocking_structure_optrelax_prec_inject():
291291
'openmp': True,
292292
'par-collapse-ncores': 1}))
293293

294-
assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
294+
assert_structure(op, ['t,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
295295
't,p_s0_blk0,p_s,rsx,rsy')
296296

297297

@@ -821,12 +821,13 @@ def test_incs_no_atomic(self):
821821
'par-collapse-ncores': 1,
822822
'par-collapse-work': 0}))
823823

824-
assert 'collapse(2)' in str(op0)
824+
assert 'collapse(3)' in str(op0)
825825
assert 'atomic' in str(op0)
826826

827827
# Now only `x` is parallelized
828828
op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)],
829829
opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1}))
830+
830831
assert 'omp for' in str(op1)
831832
assert 'collapse' not in str(op1)
832833
assert 'atomic' not in str(op1)
@@ -951,19 +952,20 @@ def test_parallel_prec_inject(self):
951952
eqns = sf.inject(field=u.forward, expr=sf * dt**2)
952953

953954
op0 = Operator(eqns, opt=('advanced', {'openmp': True,
954-
'par-collapse-ncores': 1}))
955+
'par-collapse-ncores': 20}))
955956
iterations = FindNodes(Iteration).visit(op0)
956957

957958
assert not iterations[0].pragmas
958959
assert 'omp for' in iterations[1].pragmas[0].value
960+
assert 'collapse' not in iterations[1].pragmas[0].value
959961

960962
op0 = Operator(eqns, opt=('advanced', {'openmp': True,
961963
'par-collapse-ncores': 1,
962964
'par-collapse-work': 1}))
963965
iterations = FindNodes(Iteration).visit(op0)
964966

965967
assert not iterations[0].pragmas
966-
assert 'omp for' in iterations[1].pragmas[0].value
968+
assert 'omp for collapse' in iterations[2].pragmas[0].value
967969

968970

969971
class TestNestedParallelism(object):

tests/test_gpu_openacc.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -102,15 +102,15 @@ def test_tile_insteadof_collapse(self, par_tile):
102102
opt=('advanced', {'par-tile': par_tile}))
103103

104104
trees = retrieve_iteration_tree(op)
105-
assert len(trees) == 4
105+
assert len(trees) == 6
106106

107-
assert trees[0][1].pragmas[0].value ==\
108-
'acc parallel loop tile(32,4,4) present(u)'
109107
assert trees[1][1].pragmas[0].value ==\
108+
'acc parallel loop tile(32,4,4) present(u)'
109+
assert trees[2][1].pragmas[0].value ==\
110110
'acc parallel loop tile(32,4) present(u)'
111111
# Only the AFFINE Iterations are tiled
112-
assert trees[3][1].pragmas[0].value ==\
113-
'acc parallel loop collapse(3) present(src,src_coords,u)'
112+
assert trees[4][1].pragmas[0].value ==\
113+
'acc parallel loop present(src,src_coords,u) deviceptr(r1,r2,r3)'
114114

115115
@pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)),
116116
((32, 4, 4), (8, 8, 8))])
@@ -130,11 +130,11 @@ def test_multiple_tile_sizes(self, par_tile):
130130
opt=('advanced', {'par-tile': par_tile}))
131131

132132
trees = retrieve_iteration_tree(op)
133-
assert len(trees) == 4
133+
assert len(trees) == 6
134134

135-
assert trees[0][1].pragmas[0].value ==\
136-
'acc parallel loop tile(32,4,4) present(u)'
137135
assert trees[1][1].pragmas[0].value ==\
136+
'acc parallel loop tile(32,4,4) present(u)'
137+
assert trees[2][1].pragmas[0].value ==\
138138
'acc parallel loop tile(8,8) present(u)'
139139

140140
def test_multi_tile_blocking_structure(self):

tests/test_gpu_openmp.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ def test_timeparallel_reduction(self):
265265
assert not tree.root.pragmas
266266
assert len(tree[1].pragmas) == 1
267267
assert tree[1].pragmas[0].value ==\
268-
('omp target teams distribute parallel for collapse(2)'
268+
('omp target teams distribute parallel for collapse(3)'
269269
' reduction(+:f[0])')
270270

271271

tests/test_mpi.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2558,7 +2558,8 @@ def test_adjoint_F_no_omp(self):
25582558
# TestDecomposition().test_reshape_left_right()
25592559
# TestOperatorSimple().test_trivial_eq_2d()
25602560
# TestFunction().test_halo_exchange_bilateral()
2561-
TestSparseFunction().test_sparse_coords()
2561+
# TestSparseFunction().test_sparse_coords()
25622562
# TestSparseFunction().test_precomputed_sparse(2)
25632563
# TestOperatorAdvanced().test_fission_due_to_antidep()
2564+
TestOperatorAdvanced().test_injection_wodup_wtime()
25642565
# TestIsotropicAcoustic().test_adjoint_F(1)

0 commit comments

Comments
 (0)