Skip to content

Commit

Permalink
compiler: Fix parlang reductions over >= 4 loops
Browse files Browse the repository at this point in the history
  • Loading branch information
FabioLuporini committed Jul 19, 2024
1 parent ea94a0f commit b22bb1e
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 29 deletions.
26 changes: 21 additions & 5 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
"""Collection of utilities to detect properties of the underlying architecture."""

from subprocess import PIPE, Popen, DEVNULL, run

from functools import cached_property
import cpuinfo
from subprocess import PIPE, Popen, DEVNULL, run
import ctypes
import numpy as np
import psutil
import re
import os
import sys
import json

import cpuinfo
import numpy as np
import psutil

from devito.logger import warning
from devito.tools import as_tuple, all_equal, memoized_func

Expand Down Expand Up @@ -664,6 +664,16 @@ def max_mem_trans_size(self, dtype):
assert self.max_mem_trans_nbytes % np.dtype(dtype).itemsize == 0
return int(self.max_mem_trans_nbytes / np.dtype(dtype).itemsize)

def limits(self, compiler=None, language=None):
"""
Return the architecture-specific limits for the given compiler and
language.
"""
return {
'max-par-dims': np.inf,
'max-block-dims': np.inf,
}


class Cpu64(Platform):

Expand Down Expand Up @@ -832,6 +842,12 @@ def memavail(self, deviceid=0):
except (AttributeError, KeyError):
return None

def limits(self, compiler=None, language=None):
return {
'max-par-dims': 3,
'max-block-dims': 3,
}


class IntelDevice(Device):

Expand Down
58 changes: 38 additions & 20 deletions devito/ir/clusters/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,13 +504,10 @@ def reduction_comms(clusters):
return processed


def normalize(clusters, **kwargs):
options = kwargs['options']
sregistry = kwargs['sregistry']

def normalize(clusters, sregistry=None, options=None, platform=None, **kwargs):
clusters = normalize_nested_indexeds(clusters, sregistry)
if options['mapify-reduce']:
clusters = normalize_reductions_dense(clusters, sregistry)
clusters = normalize_reductions_dense(clusters, sregistry, platform)
else:
clusters = normalize_reductions_minmax(clusters)
clusters = normalize_reductions_sparse(clusters, sregistry)
Expand Down Expand Up @@ -594,31 +591,49 @@ def normalize_reductions_minmax(cluster):
return init + [cluster.rebuild(processed)]


def normalize_reductions_dense(cluster, sregistry):
def normalize_reductions_dense(cluster, sregistry, platform):
"""
Extract the right-hand sides of reduction Eq's in to temporaries.
"""
return _normalize_reductions_dense(cluster, sregistry, {})
return _normalize_reductions_dense(cluster, {}, sregistry, platform)


@cluster_pass(mode='dense')
def _normalize_reductions_dense(cluster, sregistry, mapper):
dims = [d for d in cluster.ispace.itdims
if cluster.properties.is_parallel_atomic(d)]
if not dims:
def _normalize_reductions_dense(cluster, mapper, sregistry, platform):
"""
Transform augmented expressions whose left-hand side is a scalar into
map-reduces.
Examples
--------
Given an increment expression such as
s += f(u[x], v[x], ...)
Turn it into
r[x] = f(u[x], v[x], ...)
s += r[x]
"""
# The candidate Dimensions along which to perform the map part
candidates = [d for d in cluster.ispace.itdims
if cluster.properties.is_parallel_atomic(d)]
if not candidates:
return cluster

# If there are more parallel dimensions than the maximum allowed by the
# target platform, we must restrain the number of candidates
max_par_dims = platform.limits()['max-par-dims']
dims = candidates[-max_par_dims:]

# All other dimensions must be sequentialized because the output Array
# is constrained to `dims`
sequentialize = candidates[:-max_par_dims]

processed = []
properties = cluster.properties
for e in cluster.exprs:
if e.is_Reduction:
# Transform `e` into what is in essence an explicit map-reduce
# For example, turn:
# `s += f(u[x], v[x], ...)`
# into
# `r[x] = f(u[x], v[x], ...)`
# `s += r[x]`
# This makes it much easier to parallelize the map part regardless
# of the target backend
lhs, rhs = e.args

try:
Expand Down Expand Up @@ -650,10 +665,13 @@ def _normalize_reductions_dense(cluster, sregistry, mapper):

processed.extend([Eq(a.indexify(), rhs),
e.func(lhs, a.indexify())])

for d in sequentialize:
properties = properties.sequentialize(d)
else:
processed.append(e)

return cluster.rebuild(processed)
return cluster.rebuild(exprs=processed, properties=properties)


@cluster_pass(mode='sparse')
Expand Down
15 changes: 13 additions & 2 deletions devito/passes/clusters/blocking.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,14 @@ def _has_atomic_blockable_dim(self, cluster, d):
return any(cluster.properties.is_parallel_atomic(i)
for i in set(cluster.ispace.itdims) - {d})

def _has_enough_large_blockable_dims(self, cluster, d):
return len([i for i in set(cluster.ispace.itdims) - {d}
def _has_enough_large_blockable_dims(self, cluster, d, nested=False):
if nested:
_, ispace = cluster.ispace.split(d)
dims = set(ispace.itdims)
else:
ispace = cluster.ispace
dims = set(cluster.ispace.itdims) - {d}
return len([i for i in dims
if (cluster.properties.is_parallel_relaxed(i) and
not self._has_short_trip_count(i))]) >= 3

Expand All @@ -191,6 +197,11 @@ def callback(self, clusters, prefix):
# to have more than three large blockable Dimensions
return clusters

if self._has_enough_large_blockable_dims(c, d, nested=True):
# Virtually all device programming models forbid parallelism
# along more than three dimensions
return clusters

if any(self._has_short_trip_count(i) for i in c.ispace.itdims):
properties = c.properties.block(d, 'small')
elif self._has_data_reuse(c):
Expand Down
23 changes: 21 additions & 2 deletions tests/test_gpu_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@
from devito import (Constant, Eq, Inc, Grid, Function, ConditionalDimension,
Dimension, MatrixSparseTimeFunction, SparseTimeFunction,
SubDimension, SubDomain, SubDomainSet, TimeFunction,
Operator, configuration, switchconfig, TensorTimeFunction)
Operator, configuration, switchconfig, TensorTimeFunction,
Buffer)
from devito.arch import get_gpu_info
from devito.exceptions import InvalidArgument
from devito.ir import (Conditional, Expression, Section, FindNodes, FindSymbols,
retrieve_iteration_tree)
from devito.passes.iet.languages.openmp import OmpIteration
from devito.types import DeviceID, DeviceRM, Lock, NPThreads, PThreadArray
from devito.types import DeviceID, DeviceRM, Lock, NPThreads, PThreadArray, Symbol

from conftest import skipif

Expand Down Expand Up @@ -147,6 +148,24 @@ def test_incr_perfect_outer(self):
op()
assert np.all(w.data == 10)

def test_reduction_many_dims(self):
grid = Grid(shape=(25, 25, 25))

u = TimeFunction(name='u', grid=grid, time_order=1, save=Buffer(1))
s = Symbol(name='s', dtype=np.float32)

eqns = [Eq(s, 0),
Inc(s, 2*u + 1)]

op0 = Operator(eqns)
op1 = Operator(eqns, opt=('advanced', {'mapify-reduce': True}))

tree, = retrieve_iteration_tree(op0)
assert 'collapse(4) reduction(+:s)' in str(tree.root.pragmas[0])

tree, = retrieve_iteration_tree(op1)
assert 'collapse(3) reduction(+:s)' in str(tree[1].pragmas[0])


class Bundle(SubDomain):
"""
Expand Down

0 comments on commit b22bb1e

Please sign in to comment.