Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions docs/src/man/automatic_differentiation.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,63 @@ julia> E_sym = hessian(ψ, rand(Tensor{2,2}));
julia> norm(majorsymmetric(E) - E_sym)
0.0
```

## Inserting a known derivative
When conditionals are used in a function evaluation, automatic differentiation
may yield the wrong result. Consider, the simplified example of the function
`f(x) = is_zero(x) ? zero(x) : sin(x)`. If evaluated at `x=0`, the returning
of `zero(x)` gives a zero derivative because `zero(x)` is constant, while the
correct value is 1. In such cases, it is possible to insert a known
derivative of a function which is part of a larger function to be
automatically differentiated.

Another use case is when the analytical derivative can be computed much more
efficiently than the automatically differentiatiated derivative.

```@docs
@implement_gradient
```

### Example
Lets consider the function ``h(\mathbf{f}(\mathbf{g}(\mathbf{x})))``
where `h(x)=norm(x)`, `f(x)=x ⋅ x`, and `g(x)=dev(x)`. For `f(x)` we
then have the analytical derivative
```math
\frac{\partial f_{ij}}{\partial x_{kl}} = \delta_{ik} x_{lj} + x_{ik} \delta_{jl}
```
which we can insert into our known analytical derivative using the
`@implement_gradient` macro. Below, we compare with the result when
the full derivative is calculated using automatic differentiation.

```jldoctest
# Define functions
h(x) = norm(x)
f1(x) = x ⋅ x
f2(x) = f1(x)
g(x) = dev(x)

# Define composed functions
cfun1(x) = h(f1(g(x)))
cfun2(x) = h(f2(g(x)))

# Define known derivative
function df2dx(x::Tensor{2,dim}) where{dim}
println("Hello from df2dx") # Show that df2dx is called
fval = f2(x)
I2 = one(Tensor{2,dim})
dfdx_val = otimesu(I2, transpose(x)) + otimesu(x, I2)
return fval, dfdx_val
end

# Implement known derivative
@implement_gradient f2 df2dx

# Calculate gradients
x = rand(Tensor{2,2})

gradient(cfun1, x) ≈ gradient(cfun2, x)

# output
Hello from df2dx
true
```
1 change: 1 addition & 0 deletions src/Tensors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ export otimesu, otimesl
export minortranspose, majortranspose, isminorsymmetric, ismajorsymmetric
export tdot, dott, dotdot
export hessian, gradient, curl, divergence, laplace
export @implement_gradient
export basevec, eᵢ
export rotate
export tovoigt, tovoigt!, fromvoigt, tomandel, tomandel!, frommandel
Expand Down
149 changes: 149 additions & 0 deletions src/automatic_differentiation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ end
####################

# Scalar output -> Scalar value
"""
function _extract_value(v::ForwardDiff.Dual)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this (and _insert_gradient) are documented, perhaps remove the underscore?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are documented, but I didn't include them in the docs.
(As I documented them before using your @implement_gradient idea).
Should I then remove underscore, include in docs, and export them, or just remove underscore?

function _extract_value(v::AbstractTensor{<:Any,<:Any,<:Dual})

Extract the non-dual part of a tensor with dual entries. This
function is useful when inserting analytical derivatives using
the [`_insert_gradient`](@ref) function
"""
@inline function _extract_value(v::Dual)
return value(v)
end
Expand Down Expand Up @@ -186,6 +194,147 @@ for TensorType in (Tensor, SymmetricTensor)
end
end

######################
# Gradient insertion #
######################

# Insertions get the real value and derivative of a function, as well
# a tensor of dual values that was the initial input to that function.
# A new tensor of dual values are then created, to emulate the function
# being run with dual numbers (i.e. inserting the analytical gradient)
# As opposed to with gradient extraction, we don't have the original input
# (scalar or tensor) to the gradient function. But we can create this based
# on the tag created in the `gradient` function.
# Specifically, consider a function y=f(g(x)) where we want to supply the
# derivative df/dg (at g(x)). We then have "dy/dx = df/dg dg/dx" where
# the type of product is given by the type of g:
# g is 0th order: open product ^1
# g is 1st order: single contraction
# g is 2nd order: double contraction
#
# ^1: Regular multiplication for scalars, but in case x and f
# are vectors, then it is open product.
#
# Support is given for the following function configurations
# g (input) f (output) dfdg (derivative)
# 2nd order 0th order 2nd order
# 1st order 1st order 2nd order
# 2nd order 2nd order 4th order

# First, we define the API macro used to supply the analytical derivative
"""
@implement_gradient(f, f_dfdx)

This macro allows specifying a function `f_dfdx` that provides an analytical
derivative of the function `f`, and is invoked when `f` is differentiated
using automatic differentiation based on `ForwardDiff.jl`
(e.g. when using `Tensors.jl`'s
[`gradient`](@ref) or [`hessian`](@ref)), or one of `ForwardDiff.jl`'s API).
The function `f_dfdx` must take
the same argument as `f` and should return both the value of `f` and
the gradient, i.e. `fval, dfdx_val = f_dfdx(x)`. The following combinations
of input and output types are supported:

| `x` | `f(x)` | `dfdx` |
|:--------------------|:--------------------|:--------------------|
| `Number` | `Number` | `Number` |
| `Number` | `Vec` | `Vec` |
| `Number` | `SecondOrderTensor` | `SecondOrderTensor` |
| `Vec` | `Number` | `Vec` |
| `Vec` | `Vec` | `Tensor{2}` |
| `SecondOrderTensor` | `Number` | `SecondOrderTensor` |
| `SecondOrderTensor` | `SecondOrderTensor` | `FourthOrderTensor` |

Note that if one tensor if of symmetric type, then all tensors must
be of symmetric type

"""
macro implement_gradient(f, f_dfdx)
return :($(esc(f))(x :: Union{AbstractTensor{<:Any, <:Any, <:Dual}, Dual}) = _propagate_gradient($(esc(f_dfdx)), x))
end
# which calls the general function _propagate_gradient that calls the specialized _insert_gradient method below
function _propagate_gradient(f_dfdx::Function, x::Union{AbstractTensor{<:Any, <:Any, <:Dual}, Dual})
fval, dfdx_val = f_dfdx(_extract_value(x))
return _insert_gradient(fval, dfdx_val, x)
end

# Define the _insert_gradient method
"""
_insert_gradient(f::Union{Number,AbstractTensor}, dfdg::Union{Number,AbstractTensor}, g::ForwardDiff.Dual)
_insert_gradient(f::Union{Number,AbstractTensor}, dfdg::Union{Number,AbstractTensor}, g::Vec{<:Any,<:ForwardDiff.Dual})
_insert_gradient(f::Union{Number,AbstractTensor}, dfdg::Union{Number,AbstractTensor}, g::SecondOrderTensor{<:Any,<:ForwardDiff.Dual})

Allows inserting an analytical gradient for use with automatic differentiation.
Consider a composed function ``h(f(g(x)))``, where you have an efficient way to
calculate ``\\partial f/\\partial g``, but want to use automatic
differentiation for the other functions. Then, you can make another definition
of ``f(g)`` to dispatch on if ``g`` is a tensor with `ForwardDiff.Dual`
entires, i.e.
```julia
function f(g::Tensor{2,dim,T}) where{dim, T<:ForwardDiff.Dual}
gval = _extract_value(g) # Get the non-dual tensor value
fval = f(gval) # Calculate function value
dfdg = dfdg_analytical(fval, gval) # Calculate analytical derivative
return _insert_gradient(fval, dfdg, g) # Return the updated dual tensor
end
```

"""
function _insert_gradient(f::Union{Number,AbstractTensor}, dfdg::Union{Number,AbstractTensor}, g::Dual{Tg}) where{Tg}
dgdx = _extract_gradient(g, _get_original_gradient_input(g))
dfdx = dfdg ⊗ dgdx
return _insert_full_gradient(f, dfdx, Tg())
end

function _insert_gradient(f::Union{Number,AbstractTensor}, dfdg::Union{Number,AbstractTensor}, g::Vec{<:Any, <:Dual{Tg}}) where{Tg}
dgdx = _extract_gradient(g, _get_original_gradient_input(g))
dfdx = dfdg ⋅ dgdx
return _insert_full_gradient(f, dfdx, Tg())
end

function _insert_gradient(f::Union{Number,AbstractTensor}, dfdg::Union{Number,AbstractTensor}, g::SecondOrderTensor{<:Any,<:Dual{Tg}}) where{Tg}
dgdx = _extract_gradient(g, _get_original_gradient_input(g))
dfdx = dfdg ⊡ dgdx
return _insert_full_gradient(f, dfdx, Tg())
end

# Define helper function to figure out original input to gradient function
_get_original_gradient_input(::Dual{Tag{Tf,Tv}}) where{Tf,Tv} = zero(Tv)
_get_original_gradient_input(::AbstractTensor{<:Any,<:Any,<:Dual{Tag{Tf,Tv}}}) where{Tf,Tv} = zero(Tv)

# Define helper function to insert_the_full_gradient calculated in _insert_gradient
_insert_full_gradient(f::Number, dfdx::Number, ::Tg) where{Tg} = Dual{Tg}(f, dfdx)
_insert_full_gradient(f::Number, dfdx::AbstractTensor, ::Tg) where{Tg} = Dual{Tg}(f, get_data(dfdx))

function _insert_full_gradient(f::TT, dfdx::TT, ::Tg) where{TT<:AbstractTensor,Tg}
fdata = get_data(f)
diffdata = get_data(dfdx)
TTb = get_base(TT)
@inbounds y = TTb(ntuple(i -> Dual{Tg}(fdata[i], diffdata[i]), length(fdata)))
return y
end

function _insert_full_gradient(f::Vec{dim}, dfdx::Tensor{2,dim}, ::Tg) where{dim, Tg}
fdata = get_data(f)
diffdata = get_data(dfdx)
@inbounds y = Vec{dim}(i -> Dual{Tg}(fdata[i], ntuple(j->diffdata[i+dim*(j-1)], dim)))
return y
end

function _insert_full_gradient(f::Tensor{2,dim,<:Any,N}, dfdx::Tensor{4,dim}, ::Tg) where{dim, N, Tg}
fdata = get_data(f)
diffdata = get_data(dfdx)
@inbounds y = Tensor{2,dim}(ntuple(i->Dual{Tg}(fdata[i], ntuple(j->diffdata[i+N*(j-1)],N)), N))
return y
end
function _insert_full_gradient(f::SymmetricTensor{2,dim,<:Any,N}, dfdx::SymmetricTensor{4,dim}, ::Tg) where{dim, N, Tg}
fdata = get_data(f)
diffdata = get_data(dfdx)
@inbounds y = SymmetricTensor{2,dim}(ntuple(i->Dual{Tg}(fdata[i], ntuple(j->diffdata[i+N*(j-1)],N)), N))
return y
end


##################
# Load functions #
##################
Expand Down
4 changes: 4 additions & 0 deletions src/tensor_products.jl
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ end
TensorType(@inline function(i,j,k,l) @inbounds S1[i,j] * S2[k,l]; end)
end

@inline otimes(S1::Number, S2::Number) = S1*S2
@inline otimes(S1::AbstractTensor, S2::Number) = S1*S2
@inline otimes(S1::Number, S2::AbstractTensor) = S1*S2

const ⊗ = otimes

"""
Expand Down
142 changes: 142 additions & 0 deletions test/test_ad.jl
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,146 @@ S(C) = S(C, μ, Kb)
@test gradient(y -> gradient(x -> f(y, x), s), v)::Vec{2} ≈ Vec((v[2], v[1]))
@test gradient(y -> gradient(x -> f(x, y), v), s)::Vec{2} ≈ Vec((v[2], v[1]))
end


@testsection "analytical gradient implementation" begin
# Consider the function f(g(x)), we need to test the following variations to cover all cases
# * x::Number
# - g::Number, f:: Number, Vec, Tensor{2}, SymmetricTensor{2}
# - g::Vec, f:: Number, Vec
# - g::Tensor{2}, f:: Number, Tensor{2}
# - g::SymmetricTensor{2}, f:: Number, SymmetricTensor{2}
# * x::Vec
# - g::Number, f:: Number, Vec
# - g::Vec, f:: Number, Vec
# * x::Tensor{2}
# - g::Number, f:: Number, Tensor{2}
# - g::Tensor{2}, f:: Number, Tensor{2}
# * x::SymmetricTensor{2}
# - g::Number, f:: Number, SymmetricTensor{2}
# - g::SymmetricTensor{2}, f:: Number, SymmetricTensor{2}
#
# Define the different types of functions required to test the cases above. Naming convention:
# tm_tn: Function taking in tensor of order n and outputting tensor of order m. (Number=0, SymmetricTensor{2}=s)
# _ana: Suffix for the function for which the analytical derivative is implemented for
# Define this function with AbstractFloat input to give error if not properly overloaded for Dual
# as `isa(ForwardDiff.Dual(1, 1), AbstractFloat) = false`
for dim = 1:3

a0 = rand(); a1 = rand(Vec{dim}); a2 = rand(Tensor{2,dim}); as = rand(SymmetricTensor{2,dim})
x0 = rand(); x1 = rand(Vec{dim}); x2 = rand(Tensor{2,dim}); xs = rand(SymmetricTensor{2,dim})

# Scalar input (t0)
t0_t0(x) = sin(x); t0_t0_ana(x::AbstractFloat) = t0_t0(x); d_t0_t0(x_) = reverse(gradient(t0_t0, x_, :all)); @implement_gradient t0_t0_ana d_t0_t0
t1_t0(x) = x*a1; t1_t0_ana(x::AbstractFloat) = t1_t0(x); d_t1_t0(x_) = reverse(gradient(t1_t0, x_, :all)); @implement_gradient t1_t0_ana d_t1_t0
t2_t0(x) = x*a2; t2_t0_ana(x::AbstractFloat) = t2_t0(x); d_t2_t0(x_) = reverse(gradient(t2_t0, x_, :all)); @implement_gradient t2_t0_ana d_t2_t0
ts_t0(x) = x*as; ts_t0_ana(x::AbstractFloat) = ts_t0(x); d_ts_t0(x_) = reverse(gradient(ts_t0, x_, :all)); @implement_gradient ts_t0_ana d_ts_t0

@test gradient(t0_t0, x0) ≈ gradient(t0_t0_ana, x0)
@test gradient(t1_t0, x0) ≈ gradient(t1_t0_ana, x0)
@test gradient(t2_t0, x0) ≈ gradient(t2_t0_ana, x0)
@test gradient(ts_t0, x0) ≈ gradient(ts_t0_ana, x0)

# Vector input (t1)
t0_t1(x) = norm(x); t0_t1_ana(x::Vec{<:Any,<:AbstractFloat}) = t0_t1(x); d_t0_t1(x_) = reverse(gradient(t0_t1, x_, :all)); @implement_gradient t0_t1_ana d_t0_t1
t1_t1(x) = a0*x; t1_t1_ana(x::Vec{<:Any,<:AbstractFloat}) = t1_t1(x); d_t1_t1(x_) = reverse(gradient(t1_t1, x_, :all)); @implement_gradient t1_t1_ana d_t1_t1

@test gradient(t0_t1, x1) ≈ gradient(t0_t1_ana, x1)
@test gradient(t1_t1, x1) ≈ gradient(t1_t1_ana, x1)

# 2nd order tensor input (t2)
t0_t2(x) = norm(x); t0_t2_ana(x::Tensor{2,<:Any,<:AbstractFloat}) = t0_t2(x); d_t0_t2(x_) = reverse(gradient(t0_t2, x_, :all)); @implement_gradient t0_t2_ana d_t0_t2
t2_t2(x) = dot(x,x); t2_t2_ana(x::Tensor{2,<:Any,<:AbstractFloat}) = t2_t2(x); d_t2_t2(x_) = reverse(gradient(t2_t2, x_, :all)); @implement_gradient t2_t2_ana d_t2_t2

@test gradient(t0_t2, x2) ≈ gradient(t0_t2_ana, x2)
@test gradient(t2_t2, x2) ≈ gradient(t2_t2_ana, x2)

# 2nd order symmetric tensor input (ts)
t0_ts(x) = norm(x); t0_ts_ana(x::SymmetricTensor{2,<:Any,<:AbstractFloat}) = t0_ts(x); d_t0_ts(x_) = reverse(gradient(t0_ts, x_, :all)); @implement_gradient t0_ts_ana d_t0_ts
ts_ts(x) = dot(x); ts_ts_ana(x::SymmetricTensor{2,<:Any,<:AbstractFloat}) = ts_ts(x); d_ts_ts(x_) = reverse(gradient(ts_ts, x_, :all)); @implement_gradient ts_ts_ana d_ts_ts

@test gradient(t0_ts, xs) ≈ gradient(t0_ts_ana, xs)
@test gradient(ts_ts, xs) ≈ gradient(ts_ts_ana, xs)

# Test combined functions. The important is that another function is called first!
# f(g(x)): naming "f_g_x"
# x scalar, g scalar, f: scalar,Vec,Tensor{2},SymmetricTensor{2}:
t0_t0_t0(x) = t0_t0(t0_t0(x)); t0_t0_t0_ana(x) = t0_t0_ana(t0_t0(x))
t1_t0_t0(x) = t1_t0(t0_t0(x)); t1_t0_t0_ana(x) = t1_t0_ana(t0_t0(x))
t2_t0_t0(x) = t2_t0(t0_t0(x)); t2_t0_t0_ana(x) = t2_t0_ana(t0_t0(x))
ts_t0_t0(x) = ts_t0(t0_t0(x)); ts_t0_t0_ana(x) = ts_t0_ana(t0_t0(x))

@test gradient(t0_t0_t0, x0) ≈ gradient(t0_t0_t0_ana, x0)
@test gradient(t1_t0_t0, x0) ≈ gradient(t1_t0_t0_ana, x0)
@test gradient(t2_t0_t0, x0) ≈ gradient(t2_t0_t0_ana, x0)
@test gradient(ts_t0_t0, x0) ≈ gradient(ts_t0_t0_ana, x0)

# f(g(x)): x scalar, g vector, f: scalar,vector
t0_t1_t0(x) = t0_t1(t1_t0(x)); t0_t1_t0_ana(x) = t0_t1_ana(t1_t0(x))
t1_t1_t0(x) = t1_t1(t1_t0(x)); t1_t1_t0_ana(x) = t1_t1_ana(t1_t0(x))

@test gradient(t0_t1_t0, x0) ≈ gradient(t0_t1_t0_ana, x0)
@test gradient(t1_t1_t0, x0) ≈ gradient(t1_t1_t0_ana, x0)

# f(g(x)): x scalar, g Tensor{2}, f:scalar, Tensor{2}
t0_t2_t0(x) = t0_t2(t2_t0(x)); t0_t2_t0_ana(x) = t0_t2_ana(t2_t0(x))
t2_t2_t0(x) = t2_t2(t2_t0(x)); t2_t2_t0_ana(x) = t2_t2_ana(t2_t0(x));

@test gradient(t0_t2_t0, x0) ≈ gradient(t0_t2_t0_ana, x0)
@test gradient(t2_t2_t0, x0) ≈ gradient(t2_t2_t0_ana, x0)

# f(g(x)): x scalar, g SymmetricTensor{2}, f:scalar, SymmetricTensor{2}
t0_ts_t0(x) = t0_ts(ts_t0(x)); t0_ts_t0_ana(x) = t0_ts_ana(ts_t0(x))
ts_ts_t0(x) = ts_ts(ts_t0(x)); ts_ts_t0_ana(x) = ts_ts_ana(ts_t0(x))

@test gradient(t0_ts_t0, x0) ≈ gradient(t0_ts_t0_ana, x0)
@test gradient(ts_ts_t0, x0) ≈ gradient(ts_ts_t0_ana, x0)

# x Vec, g scalar, f: scalar, Vec
t0_t0_t1(x) = t0_t0(t0_t1(x)); t0_t0_t1_ana(x) = t0_t0_ana(t0_t1(x))
t1_t0_t1(x) = t1_t0(t0_t1(x)); t1_t0_t1_ana(x) = t1_t0_ana(t0_t1(x))

@test gradient(t1_t0_t1, x1) ≈ gradient(t1_t0_t1_ana, x1)
@test gradient(t1_t0_t1, x1) ≈ gradient(t1_t0_t1_ana, x1)

# x Vec, g Vec, f: scalar, Vec
t0_t1_t1(x) = t0_t1(t1_t1(x)); t0_t1_t1_ana(x) = t0_t1_ana(t1_t1(x))
t1_t1_t1(x) = t1_t1(t1_t1(x)); t1_t1_t1_ana(x) = t1_t1_ana(t1_t1(x))

@test gradient(t0_t1_t1, x1) ≈ gradient(t0_t1_t1_ana, x1)
@test gradient(t1_t1_t1, x1) ≈ gradient(t1_t1_t1_ana, x1)

# x Tensor{2}, g scalar, f: scalar, Tensor{2}
t0_t0_t2(x) = t0_t0(t0_t1(x)); t0_t0_t2_ana(x) = t0_t0_ana(t0_t2(x))
t2_t0_t2(x) = t2_t0(t0_t2(x)); t2_t0_t2_ana(x) = t2_t0_ana(t0_t2(x))

@test gradient(t0_t0_t2, x2) ≈ gradient(t0_t0_t2_ana, x2)
@test gradient(t2_t0_t2, x2) ≈ gradient(t2_t0_t2_ana, x2)

# x Tensor{2}, g Tensor{2}, f: scalar, Tensor{2}
t0_t2_t2(x) = t0_t2(t2_t2(x)); t0_t2_t2_ana(x) = t0_t2_ana(t2_t2(x))
t2_t2_t2(x) = t2_t2(t2_t2(x)); t2_t2_t2_ana(x) = t2_t2_ana(t2_t2(x))

@test gradient(t0_t2_t2, x2) ≈ gradient(t0_t2_t2_ana, x2)
@test gradient(t2_t2_t2, x2) ≈ gradient(t2_t2_t2_ana, x2)

# x SymmetricTensor{2}, g scalar, f: scalar, SymmetricTensor{2}
t0_t0_ts(x) = t0_t0(t0_t1(x)); t0_t0_ts_ana(x) = t0_t0_ana(t0_ts(x))
ts_t0_ts(x) = ts_t0(t0_ts(x)); ts_t0_ts_ana(x) = ts_t0_ana(t0_ts(x))

@test gradient(t0_t0_ts, xs) ≈ gradient(t0_t0_ts_ana, xs)
@test gradient(ts_t0_ts, xs) ≈ gradient(ts_t0_ts_ana, xs)

# x SymmetricTensor{2}, g SymmetricTensor{2}, f: scalar, SymmetricTensor{2}
t0_ts_ts(x) = t0_ts(ts_ts(x)); t0_ts_ts_ana(x) = t0_ts_ana(ts_ts(x))
ts_ts_ts(x) = ts_ts(ts_ts(x)); ts_ts_ts_ana(x) = ts_ts_ana(ts_ts(x))

@test gradient(t0_ts_ts, xs) ≈ gradient(t0_ts_ts_ana, xs)
@test gradient(ts_ts_ts, xs) ≈ gradient(ts_ts_ts_ana, xs)

end

end


end # testsection