Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

compiler: Fix parlang reductions over >= 4 loops #2417

Merged
merged 2 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
"""Collection of utilities to detect properties of the underlying architecture."""

from subprocess import PIPE, Popen, DEVNULL, run

from functools import cached_property
import cpuinfo
from subprocess import PIPE, Popen, DEVNULL, run
import ctypes
import numpy as np
import psutil
import re
import os
import sys
import json

import cpuinfo
import numpy as np
import psutil

from devito.logger import warning
from devito.tools import as_tuple, all_equal, memoized_func

Expand Down Expand Up @@ -664,6 +664,16 @@ def max_mem_trans_size(self, dtype):
assert self.max_mem_trans_nbytes % np.dtype(dtype).itemsize == 0
return int(self.max_mem_trans_nbytes / np.dtype(dtype).itemsize)

def limits(self, compiler=None, language=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do compiler and language get used anywhere? Why couldn't this be a property?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

they will in the future yeah, some limits may be compiler and/or language dependent

"""
Return the architecture-specific limits for the given compiler and
language.
"""
return {
'max-par-dims': sys.maxsize,
'max-block-dims': sys.maxsize,
}


class Cpu64(Platform):

Expand Down Expand Up @@ -832,6 +842,12 @@ def memavail(self, deviceid=0):
except (AttributeError, KeyError):
return None

def limits(self, compiler=None, language=None):
return {
'max-par-dims': 3,
'max-block-dims': 3,
}


class IntelDevice(Device):

Expand Down
58 changes: 38 additions & 20 deletions devito/ir/clusters/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,13 +504,10 @@ def reduction_comms(clusters):
return processed


def normalize(clusters, **kwargs):
options = kwargs['options']
sregistry = kwargs['sregistry']

def normalize(clusters, sregistry=None, options=None, platform=None, **kwargs):
clusters = normalize_nested_indexeds(clusters, sregistry)
if options['mapify-reduce']:
clusters = normalize_reductions_dense(clusters, sregistry)
clusters = normalize_reductions_dense(clusters, sregistry, platform)
else:
clusters = normalize_reductions_minmax(clusters)
clusters = normalize_reductions_sparse(clusters, sregistry)
Expand Down Expand Up @@ -594,31 +591,49 @@ def normalize_reductions_minmax(cluster):
return init + [cluster.rebuild(processed)]


def normalize_reductions_dense(cluster, sregistry):
def normalize_reductions_dense(cluster, sregistry, platform):
"""
Extract the right-hand sides of reduction Eq's in to temporaries.
"""
return _normalize_reductions_dense(cluster, sregistry, {})
return _normalize_reductions_dense(cluster, {}, sregistry, platform)


@cluster_pass(mode='dense')
def _normalize_reductions_dense(cluster, sregistry, mapper):
dims = [d for d in cluster.ispace.itdims
if cluster.properties.is_parallel_atomic(d)]
if not dims:
def _normalize_reductions_dense(cluster, mapper, sregistry, platform):
"""
Transform augmented expressions whose left-hand side is a scalar into
map-reduces.

Examples
--------
Given an increment expression such as

s += f(u[x], v[x], ...)

Turn it into

r[x] = f(u[x], v[x], ...)
s += r[x]
"""
# The candidate Dimensions along which to perform the map part
candidates = [d for d in cluster.ispace.itdims
if cluster.properties.is_parallel_atomic(d)]
if not candidates:
return cluster

# If there are more parallel dimensions than the maximum allowed by the
# target platform, we must restrain the number of candidates
max_par_dims = platform.limits()['max-par-dims']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not just platform.max-par-dims as a property?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

because (see comment above) some of these limits may be e.g. programming-model dependent, besides being architecture-dependent

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The it's the same as having the compiler store it's complex/f16 dtype it doesn't belong there and should be part of a pass with if/dispatch

dims = candidates[-max_par_dims:]

# All other dimensions must be sequentialized because the output Array
# is constrained to `dims`
sequentialize = candidates[:-max_par_dims]

processed = []
properties = cluster.properties
for e in cluster.exprs:
if e.is_Reduction:
# Transform `e` into what is in essence an explicit map-reduce
# For example, turn:
# `s += f(u[x], v[x], ...)`
# into
# `r[x] = f(u[x], v[x], ...)`
# `s += r[x]`
# This makes it much easier to parallelize the map part regardless
# of the target backend
lhs, rhs = e.args

try:
Expand Down Expand Up @@ -650,10 +665,13 @@ def _normalize_reductions_dense(cluster, sregistry, mapper):

processed.extend([Eq(a.indexify(), rhs),
e.func(lhs, a.indexify())])

for d in sequentialize:
properties = properties.sequentialize(d)
else:
processed.append(e)

return cluster.rebuild(processed)
return cluster.rebuild(exprs=processed, properties=properties)


@cluster_pass(mode='sparse')
Expand Down
15 changes: 13 additions & 2 deletions devito/passes/clusters/blocking.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,14 @@ def _has_atomic_blockable_dim(self, cluster, d):
return any(cluster.properties.is_parallel_atomic(i)
for i in set(cluster.ispace.itdims) - {d})

def _has_enough_large_blockable_dims(self, cluster, d):
return len([i for i in set(cluster.ispace.itdims) - {d}
def _has_enough_large_blockable_dims(self, cluster, d, nested=False):
if nested:
_, ispace = cluster.ispace.split(d)
dims = set(ispace.itdims)
else:
ispace = cluster.ispace
dims = set(cluster.ispace.itdims) - {d}
return len([i for i in dims
if (cluster.properties.is_parallel_relaxed(i) and
not self._has_short_trip_count(i))]) >= 3

Expand All @@ -191,6 +197,11 @@ def callback(self, clusters, prefix):
# to have more than three large blockable Dimensions
return clusters

if self._has_enough_large_blockable_dims(c, d, nested=True):
# Virtually all device programming models forbid parallelism
# along more than three dimensions
return clusters

if any(self._has_short_trip_count(i) for i in c.ispace.itdims):
properties = c.properties.block(d, 'small')
elif self._has_data_reuse(c):
Expand Down
2 changes: 2 additions & 0 deletions devito/types/dimension.py
Original file line number Diff line number Diff line change
Expand Up @@ -1327,6 +1327,8 @@ def _arg_values(self, interval, grid=None, args=None, **kwargs):
# no value supplied -> the sub-block will span the entire block
return {name: args[self.parent.step.name]}
else:
# TODO": Check the args for space order and apply heuristics (e.g.,
# `2*space_order`?) for even better block sizes
value = self._arg_defaults()[name]
if value <= args[self.root.max_name] - args[self.root.min_name] + 1:
return {name: value}
Expand Down
23 changes: 21 additions & 2 deletions tests/test_gpu_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@
from devito import (Constant, Eq, Inc, Grid, Function, ConditionalDimension,
Dimension, MatrixSparseTimeFunction, SparseTimeFunction,
SubDimension, SubDomain, SubDomainSet, TimeFunction,
Operator, configuration, switchconfig, TensorTimeFunction)
Operator, configuration, switchconfig, TensorTimeFunction,
Buffer)
from devito.arch import get_gpu_info
from devito.exceptions import InvalidArgument
from devito.ir import (Conditional, Expression, Section, FindNodes, FindSymbols,
retrieve_iteration_tree)
from devito.passes.iet.languages.openmp import OmpIteration
from devito.types import DeviceID, DeviceRM, Lock, NPThreads, PThreadArray
from devito.types import DeviceID, DeviceRM, Lock, NPThreads, PThreadArray, Symbol

from conftest import skipif

Expand Down Expand Up @@ -147,6 +148,24 @@ def test_incr_perfect_outer(self):
op()
assert np.all(w.data == 10)

def test_reduction_many_dims(self):
grid = Grid(shape=(25, 25, 25))

u = TimeFunction(name='u', grid=grid, time_order=1, save=Buffer(1))
s = Symbol(name='s', dtype=np.float32)

eqns = [Eq(s, 0),
Inc(s, 2*u + 1)]

op0 = Operator(eqns)
op1 = Operator(eqns, opt=('advanced', {'mapify-reduce': True}))

tree, = retrieve_iteration_tree(op0)
assert 'collapse(4) reduction(+:s)' in str(tree.root.pragmas[0])

tree, = retrieve_iteration_tree(op1)
assert 'collapse(3) reduction(+:s)' in str(tree[1].pragmas[0])


class Bundle(SubDomain):
"""
Expand Down
Loading