From 58efbebb03bd7c4dfb2df920a3c628299124f169 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 17 May 2023 07:33:56 +0000 Subject: [PATCH] compiler: Avoid generating collapse(1) --- devito/passes/iet/languages/openacc.py | 6 +++--- devito/passes/iet/languages/openmp.py | 5 +++-- examples/performance/00_overview.ipynb | 8 ++++---- tests/test_dle.py | 23 ++++++++++++++--------- tests/test_gpu_openacc.py | 2 +- 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/devito/passes/iet/languages/openacc.py b/devito/passes/iet/languages/openacc.py index 8887c0a4a7..af23264b16 100644 --- a/devito/passes/iet/languages/openacc.py +++ b/devito/passes/iet/languages/openacc.py @@ -26,13 +26,13 @@ def _make_construct(cls, **kwargs): return 'acc parallel loop' @classmethod - def _make_clauses(cls, ncollapsed=None, reduction=None, tile=None, **kwargs): + def _make_clauses(cls, ncollapsed=0, reduction=None, tile=None, **kwargs): clauses = [] if tile: clauses.append('tile(%s)' % ','.join(str(i) for i in tile)) - elif ncollapsed: - clauses.append('collapse(%d)' % (ncollapsed or 1)) + elif ncollapsed > 1: + clauses.append('collapse(%d)' % ncollapsed) if reduction: clauses.append(cls._make_clause_reduction_from_imask(reduction)) diff --git a/devito/passes/iet/languages/openmp.py b/devito/passes/iet/languages/openmp.py index 985c41915c..0a6876e608 100644 --- a/devito/passes/iet/languages/openmp.py +++ b/devito/passes/iet/languages/openmp.py @@ -40,11 +40,12 @@ def _make_construct(cls, parallel=False, **kwargs): return 'omp for' @classmethod - def _make_clauses(cls, ncollapsed=None, chunk_size=None, nthreads=None, + def _make_clauses(cls, ncollapsed=0, chunk_size=None, nthreads=None, reduction=None, schedule=None, **kwargs): clauses = [] - clauses.append('collapse(%d)' % (ncollapsed or 1)) + if ncollapsed > 1: + clauses.append('collapse(%d)' % ncollapsed) if chunk_size is not False: clauses.append('schedule(%s,%s)' % (schedule or 'dynamic', diff --git a/examples/performance/00_overview.ipynb b/examples/performance/00_overview.ipynb index 161343724f..8d8a75e922 100644 --- a/examples/performance/00_overview.ipynb +++ b/examples/performance/00_overview.ipynb @@ -716,7 +716,7 @@ " const int tid = omp_get_thread_num();\n", " float (*restrict r0)[z_size] __attribute__ ((aligned (64))) = (float (*)[z_size]) pr0[tid];\n", "\n", - " #pragma omp for collapse(1) schedule(dynamic,1)\n", + " #pragma omp for schedule(dynamic,1)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " for (int y = y_m - 2; y <= y_M + 2; y += 1)\n", @@ -855,7 +855,7 @@ " const int tid = omp_get_thread_num();\n", " float (*restrict r1)[z_size] __attribute__ ((aligned (64))) = (float (*)[z_size]) pr1[tid];\n", "\n", - " #pragma omp for collapse(1) schedule(dynamic,1)\n", + " #pragma omp for schedule(dynamic,1)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " for (int y = y_m - 2; y <= y_M + 2; y += 1)\n", @@ -991,7 +991,7 @@ " const int tid = omp_get_thread_num();\n", " float (*restrict r0)[z_size] __attribute__ ((aligned (64))) = (float (*)[z_size]) pr0[tid];\n", "\n", - " #pragma omp for collapse(1) schedule(dynamic,1)\n", + " #pragma omp for schedule(dynamic,1)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " for (int y = y_m - 2; y <= y_M + 2; y += 1)\n", @@ -1557,7 +1557,7 @@ " const int tid = omp_get_thread_num();\n", " float (*restrict r2)[z_size] __attribute__ ((aligned (64))) = (float (*)[z_size]) pr2[tid];\n", "\n", - " #pragma omp for collapse(1) schedule(dynamic,1)\n", + " #pragma omp for schedule(dynamic,1)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " for (int y = y_m - 2; y <= y_M + 2; y += 1)\n", diff --git a/tests/test_dle.py b/tests/test_dle.py index 77218b4d6a..9292b52664 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -816,7 +816,8 @@ def test_incs_no_atomic(self): # Now only `x` is parallelized op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)], opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1})) - assert 'collapse(1)' in str(op1) + assert 'omp for' in str(op1) + assert 'collapse' not in str(op1) assert 'atomic' not in str(op1) @pytest.mark.parametrize('exprs,simd_level,expected', [ @@ -879,18 +880,22 @@ def test_edge_cases(self, exprs, simd_level, expected): for i, e in enumerate(list(exprs)): exprs[i] = eval(e) - op = Operator(exprs, opt=('advanced', {'openmp': True})) + op = Operator(exprs, opt=('advanced', {'openmp': True, + 'par-collapse-ncores': 1})) iterations = FindNodes(Iteration).visit(op) + parallel = [i for i in iterations if i.is_Parallel] try: - assert 'omp for collapse' in iterations[0].pragmas[0].value + assert 'omp for' in iterations[0].pragmas[0].value + if len(parallel) > 1 and simd_level is not None and simd_level > 1: + assert 'collapse' in iterations[0].pragmas[0].value if simd_level: assert 'omp simd' in iterations[simd_level].pragmas[0].value except: # E.g. gcc-5 doesn't support array reductions, so the compiler will # generate different legal code assert not Ompizer._support_array_reduction(configuration['compiler']) - assert any('omp for collapse' in i.pragmas[0].value + assert any('omp for' in i.pragmas[0].value for i in iterations if i.pragmas) op.apply() @@ -910,7 +915,7 @@ def test_simd_space_invariant(self): op = Operator(eq, opt=('advanced', {'openmp': True})) iterations = FindNodes(Iteration).visit(op) - assert 'omp for collapse(1) schedule(static,1)' in iterations[0].pragmas[0].value + assert 'omp for schedule(static,1)' in iterations[0].pragmas[0].value assert 'omp simd' in iterations[1].pragmas[0].value assert 'omp simd' in iterations[3].pragmas[0].value @@ -979,8 +984,8 @@ def test_basic(self): bns, _ = assert_blocking(op, {'x0_blk0'}) iterations = FindNodes(Iteration).visit(bns['x0_blk0']) - assert iterations[0].pragmas[0].value == 'omp for collapse(1) schedule(dynamic,1)' - assert iterations[2].pragmas[0].value == ('omp parallel for collapse(1) ' + assert iterations[0].pragmas[0].value == 'omp for schedule(dynamic,1)' + assert iterations[2].pragmas[0].value == ('omp parallel for ' 'schedule(dynamic,1) ' 'num_threads(nthreads_nested)') @@ -1073,11 +1078,11 @@ def test_multiple_subnests_v1(self): 'omp for collapse(2) schedule(dynamic,1)' assert not trees[0][2].pragmas assert not trees[0][3].pragmas - assert trees[0][4].pragmas[0].value == ('omp parallel for collapse(1) ' + assert trees[0][4].pragmas[0].value == ('omp parallel for ' 'schedule(dynamic,1) ' 'num_threads(nthreads_nested)') assert not trees[1][2].pragmas - assert trees[1][3].pragmas[0].value == ('omp parallel for collapse(1) ' + assert trees[1][3].pragmas[0].value == ('omp parallel for ' 'schedule(dynamic,1) ' 'num_threads(nthreads_nested)') diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index d7ebb1b2ed..38b7eb5514 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -110,7 +110,7 @@ def test_tile_insteadof_collapse(self, par_tile): 'acc parallel loop tile(32,4) present(u)' # Only the AFFINE Iterations are tiled assert trees[3][1].pragmas[0].value ==\ - 'acc parallel loop collapse(1) present(src,src_coords,u)' + 'acc parallel loop present(src,src_coords,u)' @pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)), ((32, 4, 4), (8, 8, 8))])