devitocodes · FabioLuporini · Jun 7, 2021 · Mar 17, 2021 · Apr 13, 2021 · Apr 14, 2021
diff --git a/devito/core/cpu.py b/devito/core/cpu.py
@@ -1,12 +1,10 @@
 from functools import partial
 
-import numpy as np
-
 from devito.core.operator import CoreOperator, CustomOperator
 from devito.exceptions import InvalidOperator
 from devito.passes.equations import buffering, collect_derivatives
-from devito.passes.clusters import (Lift, blocking, cire, cse, eliminate_arrays,
-                                    extract_increments, factorize, fuse, optimize_pows)
+from devito.passes.clusters import (Lift, blocking, cire, cse, extract_increments,
+                                    factorize, fuse, optimize_pows)
 from devito.passes.iet import (CTarget, OmpTarget, avoid_denormals, mpiize,
                                optimize_halospots, hoist_prodders, relax_incr_dimensions)
 from devito.tools import timed_pass
@@ -24,17 +22,17 @@ class Cpu64OperatorMixin(object):
     3 => "blocks", "sub-blocks", and "sub-sub-blocks", ...
     """
 
-    CIRE_MINCOST_INV = 50
+    CIRE_MINGAIN = 10
     """
-    Minimum operation count of a Dimension-invariant aliasing expression to be
-    optimized away. Dimension-invariant aliases are lifted outside of one or more
-    invariant loop(s), so they require tensor temporaries that can be potentially
-    very large (e.g., the whole domain in the case of time-invariant aliases).
+    Minimum operation count reduction for a redundant expression to be optimized
+    away. Higher (lower) values make a redundant expression less (more) likely to
+    be optimized away.
     """
 
-    CIRE_MINCOST_SOPS = 10
+    CIRE_SCHEDULE = 'automatic'
     """
-    Minimum operation count of a sum-of-product aliasing expression to be optimized away.
+    Strategy used to schedule derivatives across loops. This impacts the operational
+    intensity of the generated kernel.
     """
 
     PAR_COLLAPSE_NCORES = 4
@@ -88,15 +86,9 @@ def _normalize_kwargs(cls, **kwargs):
         o['min-storage'] = oo.pop('min-storage', False)
         o['cire-rotate'] = oo.pop('cire-rotate', False)
         o['cire-maxpar'] = oo.pop('cire-maxpar', False)
-        o['cire-maxalias'] = oo.pop('cire-maxalias', False)
         o['cire-ftemps'] = oo.pop('cire-ftemps', False)
-        o['cire-mincost'] = {
-            'invariants': {
-                'scalar': np.inf,
-                'tensor': oo.pop('cire-mincost-inv', cls.CIRE_MINCOST_INV),
-            },
-            'sops': oo.pop('cire-mincost-sops', cls.CIRE_MINCOST_SOPS)
-        }
+        o['cire-mingain'] = oo.pop('cire-mingain', cls.CIRE_MINGAIN)
+        o['cire-schedule'] = oo.pop('cire-schedule', cls.CIRE_SCHEDULE)
 
         # Shared-memory parallelism
         o['par-collapse-ncores'] = oo.pop('par-collapse-ncores', cls.PAR_COLLAPSE_NCORES)
@@ -173,18 +165,16 @@ def _specialize_clusters(cls, clusters, **kwargs):
         # Blocking to improve data locality
         clusters = blocking(clusters, options)
 
-        # Reduce flops (potential arithmetic alterations)
+        # Reduce flops
         clusters = extract_increments(clusters, sregistry)
         clusters = cire(clusters, 'sops', sregistry, options, platform)
         clusters = factorize(clusters)
         clusters = optimize_pows(clusters)
 
-        # The previous passes may have created fusion opportunities, which in
-        # turn may enable further optimizations
+        # The previous passes may have created fusion opportunities
         clusters = fuse(clusters)
-        clusters = eliminate_arrays(clusters)
 
-        # Reduce flops (no arithmetic alterations)
+        # Reduce flops
         clusters = cse(clusters, sregistry)
 
         return clusters
@@ -260,10 +250,8 @@ def _specialize_clusters(cls, clusters, **kwargs):
         clusters = factorize(clusters)
         clusters = optimize_pows(clusters)
 
-        # The previous passes may have created fusion opportunities, which in
-        # turn may enable further optimizations
+        # The previous passes may have created fusion opportunities
         clusters = fuse(clusters)
-        clusters = eliminate_arrays(clusters)
 
         # Reduce flops (no arithmetic alterations)
         clusters = cse(clusters, sregistry)

diff --git a/devito/core/gpu.py b/devito/core/gpu.py
@@ -6,8 +6,7 @@
 from devito.exceptions import InvalidOperator
 from devito.passes.equations import collect_derivatives, buffering
 from devito.passes.clusters import (Lift, Streaming, Tasker, blocking, cire, cse,
-                                    eliminate_arrays, extract_increments, factorize,
-                                    fuse, optimize_pows)
+                                    extract_increments, factorize, fuse, optimize_pows)
 from devito.passes.iet import (DeviceOmpTarget, DeviceAccTarget, optimize_halospots,
                                mpiize, hoist_prodders, is_on_device)
 from devito.tools import as_tuple, timed_pass
@@ -26,17 +25,17 @@ class DeviceOperatorMixin(object):
     3 => "blocks", "sub-blocks", and "sub-sub-blocks", ...
     """
 
-    CIRE_MINCOST_INV = 50
+    CIRE_MINGAIN = 10
     """
-    Minimum operation count of a Dimension-invariant aliasing expression to be
-    optimized away. Dimension-invariant aliases are lifted outside of one or more
-    invariant loop(s), so they require tensor temporaries that can be potentially
-    very large (e.g., the whole domain in the case of time-invariant aliases).
+    Minimum operation count reduction for a redundant expression to be optimized
+    away. Higher (lower) values make a redundant expression less (more) likely to
+    be optimized away.
     """
 
-    CIRE_MINCOST_SOPS = 10
+    CIRE_SCHEDULE = 'automatic'
     """
-    Minimum operation count of a sum-of-product aliasing expression to be optimized away.
+    Strategy used to schedule derivatives across loops. This impacts the operational
+    intensity of the generated kernel.
     """
 
     PAR_CHUNK_NONAFFINE = 3
@@ -69,15 +68,9 @@ def _normalize_kwargs(cls, **kwargs):
         o['min-storage'] = False
         o['cire-rotate'] = False
         o['cire-maxpar'] = oo.pop('cire-maxpar', True)
-        o['cire-maxalias'] = oo.pop('cire-maxalias', False)
         o['cire-ftemps'] = oo.pop('cire-ftemps', False)
-        o['cire-mincost'] = {
-            'invariants': {
-                'scalar': 1,
-                'tensor': oo.pop('cire-mincost-inv', cls.CIRE_MINCOST_INV),
-            },
-            'sops': oo.pop('cire-mincost-sops', cls.CIRE_MINCOST_SOPS)
-        }
+        o['cire-mingain'] = oo.pop('cire-mingain', cls.CIRE_MINGAIN)
+        o['cire-schedule'] = oo.pop('cire-schedule', cls.CIRE_SCHEDULE)
 
         # GPU parallelism
         o['par-collapse-ncores'] = 1  # Always use a collapse clause
@@ -156,19 +149,17 @@ def _specialize_clusters(cls, clusters, **kwargs):
         clusters = cire(clusters, 'invariants', sregistry, options, platform)
         clusters = Lift().process(clusters)
 
-        # Reduce flops (potential arithmetic alterations)
+        # Reduce flops
         clusters = extract_increments(clusters, sregistry)
         clusters = cire(clusters, 'sops', sregistry, options, platform)
         clusters = factorize(clusters)
         clusters = optimize_pows(clusters)
 
-        # Reduce flops (no arithmetic alterations)
-        clusters = cse(clusters, sregistry)
-
-        # Lifting may create fusion opportunities, which in turn may enable
-        # further optimizations
+        # The previous passes may have created fusion opportunities
         clusters = fuse(clusters)
-        clusters = eliminate_arrays(clusters)
+
+        # Reduce flops
+        clusters = cse(clusters, sregistry)
 
         return clusters
 

diff --git a/devito/finite_differences/coefficients.py b/devito/finite_differences/coefficients.py
@@ -148,8 +148,8 @@ class Substitutions(object):
     check that by
 
     >>> eq.evaluate
-    Eq(0.1*u(t, x, y) - 0.6*u(t, x - h_x, y) + 0.6*u(t, x + h_x, y) \
-- u(t, x, y)/dt + u(t + dt, x, y)/dt, 0)
+    Eq(-u(t, x, y)/dt + u(t + dt, x, y)/dt + 0.1*u(t, x, y) - \
+0.6*u(t, x - h_x, y) + 0.6*u(t, x + h_x, y), 0)
 
     Notes
     -----

diff --git a/devito/finite_differences/derivative.py b/devito/finite_differences/derivative.py
@@ -7,7 +7,7 @@
 from devito.finite_differences.finite_difference import (generic_derivative,
                                                          first_derivative,
                                                          cross_derivative)
-from devito.finite_differences.differentiable import Differentiable, EvalDerivative
+from devito.finite_differences.differentiable import Differentiable
 from devito.finite_differences.tools import direct, transpose
 from devito.tools import as_mapper, as_tuple, filter_ordered, frozendict
 from devito.types.utils import DimensionTuple
@@ -333,9 +333,6 @@ def _eval_fd(self, expr):
         - 3: Evaluate remaining terms (as `g` may need to be evaluated
         at a different point).
         - 4: Apply substitutions.
-        - 5: Cast to an object of type `EvalDerivative` so that we know
-             the argument stems from a `Derivative. This may be useful for
-             later compilation passes.
         """
         # Step 1: Evaluate derivatives within expression
         expr = getattr(expr, '_eval_deriv', expr)
@@ -359,8 +356,4 @@ def _eval_fd(self, expr):
         for e in self._ppsubs:
             res = res.xreplace(e)
 
-        # Step 5: Cast to EvaluatedDerivative
-        assert res.is_Add
-        res = EvalDerivative(*res.args, evaluate=False)
-
         return res
diff --git a/devito/finite_differences/differentiable.py b/devito/finite_differences/differentiable.py
@@ -2,17 +2,19 @@
 from functools import singledispatch
 
 import sympy
+from sympy.core.add import _addsort
+from sympy.core.mul import _mulsort
 from sympy.core.decorators import call_highest_priority
 from sympy.core.evalf import evalf_table
 
 from cached_property import cached_property
 from devito.finite_differences.tools import make_shift_x0
 from devito.logger import warning
-from devito.tools import filter_ordered, flatten
+from devito.tools import filter_ordered, flatten, split
 from devito.types.lazy import Evaluable
 from devito.types.utils import DimensionTuple
 
-__all__ = ['Differentiable']
+__all__ = ['Differentiable', 'EvalDerivative']
 
 
 class Differentiable(sympy.Expr, Evaluable):
@@ -300,6 +302,11 @@ class DifferentiableOp(Differentiable):
     __sympy_class__ = None
 
     def __new__(cls, *args, **kwargs):
+        # Do not re-evaluate if any of the args is an EvalDerivative,
+        # since the integrity of these objects must be preserved
+        if any(isinstance(i, EvalDerivative) for i in args):
+            kwargs['evaluate'] = False
+
         obj = cls.__base__.__new__(cls, *args, **kwargs)
 
         # Unfortunately SymPy may build new sympy.core objects (e.g., sympy.Add),
@@ -363,12 +370,54 @@ def _eval_at(self, func):
 
 class Add(DifferentiableOp, sympy.Add):
     __sympy_class__ = sympy.Add
-    __new__ = DifferentiableOp.__new__
+
+    def __new__(cls, *args, **kwargs):
+        # Callers often pass `evaluate=False` to prevent SymPy evaluation (e.g.,
+        # when rebuilding via `func(*args, evaluate=False)`), but in all cases we
+        # at least apply a small set of basic simplifications
+
+        # (a+b)+c -> a+b+c (flattening)
+        nested, others = split(args, lambda e: isinstance(e, Add))
+        args = flatten(e.args for e in nested) + list(others)
+
+        # a+0 -> a
+        args = [i for i in args if i != 0]
+
+        # Reorder for homogeneity with pure SymPy types
+        _addsort(args)
+
+        return super().__new__(cls, *args, **kwargs)
 
 
 class Mul(DifferentiableOp, sympy.Mul):
     __sympy_class__ = sympy.Mul
-    __new__ = DifferentiableOp.__new__
+
+    def __new__(cls, *args, **kwargs):
+        # A Mul, being a DifferentiableOp, may not trigger evaluation upon
+        # construction (e.g., when an EvalDerivative is present among its
+        # arguments), so here we apply a small set of basic simplifications
+        # to avoid generating functional, but also ugly, code
+
+        # (a*b)*c -> a*b*c (flattening)
+        nested, others = split(args, lambda e: isinstance(e, Mul))
+        args = flatten(e.args for e in nested) + list(others)
+
+        # a*0 -> 0
+        if any(i == 0 for i in args):
+            return sympy.S.Zero
+
+        # a*1 -> a
+        args = [i for i in args if i != 1]
+
+        # a*-1*-1 -> a (only when the number of -1 factors is even)
+        nminus = len([i for i in args if i == sympy.S.NegativeOne])
+        if nminus % 2 == 0:
+            args = [i for i in args if i != sympy.S.NegativeOne]
+
+        # Reorder for homogeneity with pure SymPy types
+        _mulsort(args)
+
+        return super().__new__(cls, *args, **kwargs)
 
     @property
     def _gather_for_diff(self):
@@ -411,17 +460,46 @@ def _gather_for_diff(self):
 class Pow(DifferentiableOp, sympy.Pow):
     _fd_priority = 0
     __sympy_class__ = sympy.Pow
-    __new__ = DifferentiableOp.__new__
 
 
 class Mod(DifferentiableOp, sympy.Mod):
     __sympy_class__ = sympy.Mod
-    __new__ = DifferentiableOp.__new__
 
 
 class EvalDerivative(DifferentiableOp, sympy.Add):
-    __sympy_class__ = sympy.Add
-    __new__ = DifferentiableOp.__new__
+
+    is_commutative = True
+
+    def __new__(cls, *args, base=None, **kwargs):
+        kwargs['evaluate'] = False
+
+        # a+0 -> a
+        args = [i for i in args if i != 0]
+
+        # Reorder for homogeneity with pure SymPy types
+        _addsort(args)
+
+        obj = super().__new__(cls, *args, **kwargs)
+
+        try:
+            obj.base = base
+        except AttributeError:
+            # This might happen if, e.g., one attempts a (re)construction with
+            # a single argument. The (re)constructed EvalDerivative degenerates
+            # to an object of a different type, in classic SymPy style; that is fine
+            assert len(args) <= 1
+            assert not obj.is_Add
+            return obj
+
+        return obj
+
+    @property
+    def func(self):
+        return lambda *a, **kw: EvalDerivative(*a, base=self.base, **kw)
+
+    def _new_rawargs(self, *args, **kwargs):
+        kwargs.pop('is_commutative', None)
+        return self.func(*args, **kwargs)
 
 
 class diffify(object):
@@ -502,6 +580,9 @@ def _diff2sympy(obj):
         except AttributeError:
             # Not of type DifferentiableOp
             pass
+        except TypeError:
+            # Won't lower (e.g., EvalDerivative)
+            pass
         if flag:
             return obj.func(*args, evaluate=False), True
         else:

diff --git a/devito/finite_differences/finite_difference.py b/devito/finite_differences/finite_difference.py
@@ -188,8 +188,8 @@ def cross_derivative(expr, dims, fd_order, deriv_order, **kwargs):
     >>> f = Function(name='f', grid=grid, space_order=2)
     >>> g = Function(name='g', grid=grid, space_order=2)
     >>> cross_derivative(f*g, dims=(x, y), fd_order=(2, 2), deriv_order=(1, 1))
-    -(-f(x, y)*g(x, y)/h_x + f(x + h_x, y)*g(x + h_x, y)/h_x)/h_y +\
- (-f(x, y + h_y)*g(x, y + h_y)/h_x + f(x + h_x, y + h_y)*g(x + h_x, y + h_y)/h_x)/h_y
+    (-1/h_y)*(-f(x, y)*g(x, y)/h_x + f(x + h_x, y)*g(x + h_x, y)/h_x) + \
+(-f(x, y + h_y)*g(x, y + h_y)/h_x + f(x + h_x, y + h_y)*g(x + h_x, y + h_y)/h_x)/h_y
 
     Semantically, this is equivalent to
 
@@ -200,15 +200,15 @@ def cross_derivative(expr, dims, fd_order, deriv_order, **kwargs):
     The expanded form is obtained via ``evaluate``
 
     >>> (f*g).dxdy.evaluate
-    -(-f(x, y)*g(x, y)/h_x + f(x + h_x, y)*g(x + h_x, y)/h_x)/h_y +\
- (-f(x, y + h_y)*g(x, y + h_y)/h_x + f(x + h_x, y + h_y)*g(x + h_x, y + h_y)/h_x)/h_y
+    (-1/h_y)*(-f(x, y)*g(x, y)/h_x + f(x + h_x, y)*g(x + h_x, y)/h_x) + \
+(-f(x, y + h_y)*g(x, y + h_y)/h_x + f(x + h_x, y + h_y)*g(x + h_x, y + h_y)/h_x)/h_y
 
     Finally the x0 argument allows to choose the origin of the finite-difference
 
     >>> cross_derivative(f*g, dims=(x, y), fd_order=(2, 2), deriv_order=(1, 1), \
     x0={x: 1, y: 2})
-    -(-f(1, 2)*g(1, 2)/h_x + f(h_x + 1, 2)*g(h_x + 1, 2)/h_x)/h_y +\
- (-f(1, h_y + 2)*g(1, h_y + 2)/h_x + f(h_x + 1, h_y + 2)*g(h_x + 1, h_y + 2)/h_x)/h_y
+    (-1/h_y)*(-f(1, 2)*g(1, 2)/h_x + f(h_x + 1, 2)*g(h_x + 1, 2)/h_x) + (-f(1, h_y + 2)*\
+g(1, h_y + 2)/h_x + f(h_x + 1, h_y + 2)*g(h_x + 1, h_y + 2)/h_x)/h_y
     """
     x0 = kwargs.get('x0', {})
     for d, fd, dim in zip(deriv_order, fd_order, dims):
@@ -282,6 +282,6 @@ def indices_weights_to_fd(expr, dim, inds, weights, matvec=1):
         c = sympify(c).evalf(_PRECISION)
         terms.append(expr._subs(dim, iloc - (expr.indices_ref[dim] - dim)) * c)
 
-    deriv = EvalDerivative(*terms)
+    deriv = EvalDerivative(*terms, base=expr)
 
     return deriv