Skip to content

Commit

Permalink
Merge pull request #2129 from devitocodes/drop-collapse1-nvc
Browse files: browse the repository at this point in the history
compiler: Avoid generating collapse(1)
  • Loading branch information
FabioLuporini authored May 17, 2023
2 parents 526dcb8 + 58efbeb commit c7d15b6
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 19 deletions.
6 changes: 3 additions & 3 deletions devito/passes/iet/languages/openacc.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -26,13 +26,13 @@ def _make_construct(cls, **kwargs):
return 'acc parallel loop'

@classmethod
def _make_clauses(cls, ncollapsed=None, reduction=None, tile=None, **kwargs):
def _make_clauses(cls, ncollapsed=0, reduction=None, tile=None, **kwargs):
clauses = []

if tile:
clauses.append('tile(%s)' % ','.join(str(i) for i in tile))
elif ncollapsed:
clauses.append('collapse(%d)' % (ncollapsed or 1))
elif ncollapsed > 1:
clauses.append('collapse(%d)' % ncollapsed)

if reduction:
clauses.append(cls._make_clause_reduction_from_imask(reduction))
Expand Down
5 changes: 3 additions & 2 deletions devito/passes/iet/languages/openmp.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -40,11 +40,12 @@ def _make_construct(cls, parallel=False, **kwargs):
return 'omp for'

@classmethod
def _make_clauses(cls, ncollapsed=None, chunk_size=None, nthreads=None,
def _make_clauses(cls, ncollapsed=0, chunk_size=None, nthreads=None,
reduction=None, schedule=None, **kwargs):
clauses = []

clauses.append('collapse(%d)' % (ncollapsed or 1))
if ncollapsed > 1:
clauses.append('collapse(%d)' % ncollapsed)

if chunk_size is not False:
clauses.append('schedule(%s,%s)' % (schedule or 'dynamic',
Expand Down
8 changes: 4 additions & 4 deletions examples/performance/00_overview.ipynb
Original file line number | Diff line number | Diff line change
Expand Up @@ -716,7 +716,7 @@
" const int tid = omp_get_thread_num();\n",
" float (*restrict r0)[z_size] __attribute__ ((aligned (64))) = (float (*)[z_size]) pr0[tid];\n",
"\n",
" #pragma omp for collapse(1) schedule(dynamic,1)\n",
" #pragma omp for schedule(dynamic,1)\n",
" for (int x = x_m; x <= x_M; x += 1)\n",
" {\n",
" for (int y = y_m - 2; y <= y_M + 2; y += 1)\n",
Expand Down Expand Up @@ -855,7 +855,7 @@
" const int tid = omp_get_thread_num();\n",
" float (*restrict r1)[z_size] __attribute__ ((aligned (64))) = (float (*)[z_size]) pr1[tid];\n",
"\n",
" #pragma omp for collapse(1) schedule(dynamic,1)\n",
" #pragma omp for schedule(dynamic,1)\n",
" for (int x = x_m; x <= x_M; x += 1)\n",
" {\n",
" for (int y = y_m - 2; y <= y_M + 2; y += 1)\n",
Expand Down Expand Up @@ -991,7 +991,7 @@
" const int tid = omp_get_thread_num();\n",
" float (*restrict r0)[z_size] __attribute__ ((aligned (64))) = (float (*)[z_size]) pr0[tid];\n",
"\n",
" #pragma omp for collapse(1) schedule(dynamic,1)\n",
" #pragma omp for schedule(dynamic,1)\n",
" for (int x = x_m; x <= x_M; x += 1)\n",
" {\n",
" for (int y = y_m - 2; y <= y_M + 2; y += 1)\n",
Expand Down Expand Up @@ -1557,7 +1557,7 @@
" const int tid = omp_get_thread_num();\n",
" float (*restrict r2)[z_size] __attribute__ ((aligned (64))) = (float (*)[z_size]) pr2[tid];\n",
"\n",
" #pragma omp for collapse(1) schedule(dynamic,1)\n",
" #pragma omp for schedule(dynamic,1)\n",
" for (int x = x_m; x <= x_M; x += 1)\n",
" {\n",
" for (int y = y_m - 2; y <= y_M + 2; y += 1)\n",
Expand Down
23 changes: 14 additions & 9 deletions tests/test_dle.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -816,7 +816,8 @@ def test_incs_no_atomic(self):
# Now only `x` is parallelized
op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)],
opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1}))
assert 'collapse(1)' in str(op1)
assert 'omp for' in str(op1)
assert 'collapse' not in str(op1)
assert 'atomic' not in str(op1)

@pytest.mark.parametrize('exprs,simd_level,expected', [
Expand Down Expand Up @@ -879,18 +880,22 @@ def test_edge_cases(self, exprs, simd_level, expected):
for i, e in enumerate(list(exprs)):
exprs[i] = eval(e)

op = Operator(exprs, opt=('advanced', {'openmp': True}))
op = Operator(exprs, opt=('advanced', {'openmp': True,
'par-collapse-ncores': 1}))

iterations = FindNodes(Iteration).visit(op)
parallel = [i for i in iterations if i.is_Parallel]
try:
assert 'omp for collapse' in iterations[0].pragmas[0].value
assert 'omp for' in iterations[0].pragmas[0].value
if len(parallel) > 1 and simd_level is not None and simd_level > 1:
assert 'collapse' in iterations[0].pragmas[0].value
if simd_level:
assert 'omp simd' in iterations[simd_level].pragmas[0].value
except:
# E.g. gcc-5 doesn't support array reductions, so the compiler will
# generate different legal code
assert not Ompizer._support_array_reduction(configuration['compiler'])
assert any('omp for collapse' in i.pragmas[0].value
assert any('omp for' in i.pragmas[0].value
for i in iterations if i.pragmas)

op.apply()
Expand All @@ -910,7 +915,7 @@ def test_simd_space_invariant(self):
op = Operator(eq, opt=('advanced', {'openmp': True}))
iterations = FindNodes(Iteration).visit(op)

assert 'omp for collapse(1) schedule(static,1)' in iterations[0].pragmas[0].value
assert 'omp for schedule(static,1)' in iterations[0].pragmas[0].value
assert 'omp simd' in iterations[1].pragmas[0].value
assert 'omp simd' in iterations[3].pragmas[0].value

Expand Down Expand Up @@ -979,8 +984,8 @@ def test_basic(self):
bns, _ = assert_blocking(op, {'x0_blk0'})

iterations = FindNodes(Iteration).visit(bns['x0_blk0'])
assert iterations[0].pragmas[0].value == 'omp for collapse(1) schedule(dynamic,1)'
assert iterations[2].pragmas[0].value == ('omp parallel for collapse(1) '
assert iterations[0].pragmas[0].value == 'omp for schedule(dynamic,1)'
assert iterations[2].pragmas[0].value == ('omp parallel for '
'schedule(dynamic,1) '
'num_threads(nthreads_nested)')

Expand Down Expand Up @@ -1073,11 +1078,11 @@ def test_multiple_subnests_v1(self):
'omp for collapse(2) schedule(dynamic,1)'
assert not trees[0][2].pragmas
assert not trees[0][3].pragmas
assert trees[0][4].pragmas[0].value == ('omp parallel for collapse(1) '
assert trees[0][4].pragmas[0].value == ('omp parallel for '
'schedule(dynamic,1) '
'num_threads(nthreads_nested)')
assert not trees[1][2].pragmas
assert trees[1][3].pragmas[0].value == ('omp parallel for collapse(1) '
assert trees[1][3].pragmas[0].value == ('omp parallel for '
'schedule(dynamic,1) '
'num_threads(nthreads_nested)')

Expand Down
2 changes: 1 addition & 1 deletion tests/test_gpu_openacc.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -110,7 +110,7 @@ def test_tile_insteadof_collapse(self, par_tile):
'acc parallel loop tile(32,4) present(u)'
# Only the AFFINE Iterations are tiled
assert trees[3][1].pragmas[0].value ==\
'acc parallel loop collapse(1) present(src,src_coords,u)'
'acc parallel loop present(src,src_coords,u)'

@pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)),
((32, 4, 4), (8, 8, 8))])
Expand Down

0 comments on commit c7d15b6

Please sign in to comment.