Commit a307f82

Merge pull request #485 from TuringLang/flux-support
Flux support (#457)
2 parents 8dc84e1 + a73904b commit a307f82

11 files changed: +41 -67 lines changed

REQUIRE (+2)

@@ -5,6 +5,8 @@ Markdown
 Distributions 0.11.0
 ForwardDiff
 MCMCChain 0.1.0
+Flux
+Stan

 ProgressMeter

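Flux and Stan move from optional, @require-guarded packages to hard dependencies. The rest of the diff builds on Flux's Tracker module for reverse-mode AD; below is a minimal standalone sketch of that API (based on Flux's documented Tracker interface, not code from this commit):

    using Flux: Tracker

    # Reverse-mode derivative of a scalar function:
    g = Tracker.gradient(x -> 3x^2 + 2x + 1, 5.0)  # returns a 1-tuple of tracked values
    first(g)        # 32.0 (tracked), since d/dx = 6x + 2 at x = 5
    first(g).data   # 32.0, the plain Float64 underneath
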
src/Turing.jl (+4 -8)

@@ -16,25 +16,21 @@ using LinearAlgebra
 using ProgressMeter
 using Markdown

-@init @require Stan="682df890-35be-576f-97d0-3d8c8b33a550" begin
+# @init @require Stan="682df890-35be-576f-97d0-3d8c8b33a550" begin
 using Stan
 import Stan: Adapt, Hmc
-end
-@init @require ReverseDiff="37e2e3b7-166d-5795-8a7a-e32c996b4267" begin
-using ReverseDiff: GradientTape, GradientConfig, gradient!, compile, TrackedArray
-import ReverseDiff: gradient
-end
-
+# end
 import Base: ~, convert, promote_rule, rand, getindex, setindex!
 import Distributions: sample
 import ForwardDiff: gradient
+using Flux: Tracker
 import MCMCChain: AbstractChains, Chains

 ##############################
 # Global variables/constants #
 ##############################

-global ADBACKEND = :forward_diff
+global ADBACKEND = :reverse_diff
 setadbackend(backend_sym) = begin
   @assert backend_sym == :forward_diff || backend_sym == :reverse_diff
   global ADBACKEND = backend_sym

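With ReverseDiff removed, the default ADBACKEND switches to :reverse_diff, now backed by Flux.Tracker. A usage sketch of the setter shown just above (assuming it is called as Turing.setadbackend; the two symbols are the ones checked by the @assert):

    using Turing

    Turing.setadbackend(:reverse_diff)   # the new default: reverse-mode AD via Flux.Tracker
    Turing.setadbackend(:forward_diff)   # ForwardDiff remains available
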
src/core/ad.jl (+17 -35)

@@ -79,6 +79,8 @@ gradient(vi::VarInfo, model::Function, spl::Union{Nothing, Sampler}) = begin
     spl.info[:grad_cache][θ_hash] = grad
   end

+  vi.logp = realpart(vi.logp)
+
   grad
 end

@@ -93,8 +95,7 @@ verifygrad(grad::Vector{Float64}) = begin
 end

 # Direct call of ForwardDiff.gradient; this is slow
-
-gradient2(_vi::VarInfo, model::Function, spl::Union{Nothing, Sampler}) = begin
+gradient_slow(_vi::VarInfo, model::Function, spl::Union{Nothing, Sampler}) = begin

   vi = deepcopy(_vi)

@@ -108,38 +109,19 @@ gradient2(_vi::VarInfo, model::Function, spl::Union{Nothing, Sampler}) = begin
   g(vi[spl])
 end

-@init @require ReverseDiff="37e2e3b7-166d-5795-8a7a-e32c996b4267" begin
-
-gradient_r(theta::Vector{Float64}, vi::VarInfo, model::Function) = gradient_r(theta, vi, model, nothing)
-gradient_r(theta::Vector{Float64}, vi::Turing.VarInfo, model::Function, spl::Union{Nothing, Sampler}) = begin
-  inputs = (theta)
-
-  if Turing.ADSAFE || (spl == nothing || length(spl.info[:reverse_diff_cache]) == 0)
-    f_r(ipts) = begin
-      vi[spl][:] = ipts[:]
-      -runmodel(model, vi, spl).logp
-    end
-    gtape = GradientTape(f_r, inputs)
-    ctape = compile(gtape)
-    res = (similar(theta))
-
-    if spl != nothing
-      spl.info[:reverse_diff_cache][:ctape] = ctape
-      spl.info[:reverse_diff_cache][:res] = res
-    end
-  else
-    ctape = spl.info[:reverse_diff_cache][:ctape]
-    res = spl.info[:reverse_diff_cache][:res]
-  end
-
-  grad = ReverseDiff.gradient!(res, ctape, inputs)
-
-  # grad = ReverseDiff.gradient(x -> (vi[spl] = x; -runmodel(model, vi, spl).logp), inputs)
-
-  # vi[spl] = realpart(vi[spl])
-  # vi.logp = 0
-
-  grad
+gradient_r(theta::AbstractVector{<:Real}, vi::VarInfo, model::Function) =
+  gradient_r(theta, vi, model, nothing)
+gradient_r(theta::AbstractVector{<:Real}, vi::Turing.VarInfo, model::Function, spl::Union{Nothing, Sampler}) = begin
+  # Use Flux.Tracker to get gradient
+  grad = Tracker.gradient(x -> (vi[spl] = x; -runmodel(model, vi, spl).logp), theta)
+  # Clean tracked numbers
+  # Numbers do not need to be tracked between two gradient calls
+  vi.logp = vi.logp.data
+  vi_spl = vi[spl]
+  for i = 1:length(theta)
+    vi_spl[i] = vi_spl[i].data
+  end
+  # Return non-tracked graident value
+  return first(grad).data
 end

-end

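The taped ReverseDiff implementation of gradient_r is replaced by a single Tracker.gradient call, after which the tracking is stripped (via .data) from vi.logp and from the parameter vector. A standalone sketch of the same pattern, with neg_logp as a hypothetical stand-in for -runmodel(model, vi, spl).logp:

    using Flux: Tracker

    neg_logp(θ) = 0.5 * sum(θ .^ 2)        # stand-in for -runmodel(model, vi, spl).logp

    θ = randn(3)
    grads = Tracker.gradient(neg_logp, θ)  # 1-tuple holding a tracked gradient array
    grad = first(grads).data               # plain Vector{Float64}, mirroring `return first(grad).data`
    # grad ≈ θ, since ∇(½‖θ‖²) = θ
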
src/core/util.jl (-4)

@@ -4,10 +4,6 @@

 @inline invlogit(x::Union{T,Vector{T},Matrix{T}}) where T<:Real = one(T) ./ (one(T) .+ exp.(-x))
 @inline logit(x::Union{T,Vector{T},Matrix{T}}) where T<:Real = log.(x ./ (one(T) - x))
-@init @require ReverseDiff="37e2e3b7-166d-5795-8a7a-e32c996b4267" begin
-  @inline invlogit(x::TrackedArray) = one(Real) ./ (one(Real) + exp.(-x))
-  @inline logit(x::TrackedArray) = log.(x ./ (one(Real) - x))
-end

 # More stable, faster version of rand(Categorical)
 function randcat(p::Vector{Float64})

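The ReverseDiff-specific TrackedArray methods for invlogit and logit are dropped; only the generic definitions remain. A standalone sketch (untyped re-definitions for illustration, not the package's exact signatures), assuming Tracker's standard scalar math so the same expressions also evaluate on a tracked value:

    using Flux: Tracker

    invlogit(x) = 1 ./ (1 .+ exp.(-x))
    logit(p)    = log.(p ./ (1 .- p))

    invlogit(0.3)                  # ≈ 0.5744
    logit(invlogit(0.3))           # ≈ 0.3 (the two are inverses)
    invlogit(Tracker.param(0.3))   # also evaluates on a tracked scalar
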
src/helper.jl (+3 -3)

@@ -10,14 +10,14 @@
 @inline realpart(ds::Matrix{Any}) = [realpart(col) for col in ds]
 @inline realpart(ds::Array) = map(d -> realpart(d), ds) # NOTE: this function is not optimized
 # @inline realpart(ds::TArray) = realpart(Array(ds)) # TODO: is it disabled temporary
-@init @require ReverseDiff="37e2e3b7-166d-5795-8a7a-e32c996b4267" begin
-  @inline realpart(ta::ReverseDiff.TrackedReal) = ta.value
-end
+@inline realpart(ta::Tracker.TrackedReal) = ta.data

 @inline dualpart(d::ForwardDiff.Dual) = d.partials.values
 @inline dualpart(ds::Union{Array,SubArray}) = map(d -> dualpart(d), ds)

 # Base.promote_rule(D1::Type{Real}, D2::Type{ForwardDiff.Dual}) = D2
+import Base: <=
+<=(a::Tracker.TrackedReal, b::Tracker.TrackedReal) = a.data <= b.data

 #####################################################
 # Helper functions for vectorize/reconstruct values #

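realpart now unwraps Flux's Tracker.TrackedReal through its data field, and a <= comparison between two tracked reals is defined on the underlying values (presumably because it was missing for tracked operands at the time). A small sketch mirroring these two definitions:

    using Flux: Tracker
    import Base: <=

    # Same overload as in helper.jl:
    <=(a::Tracker.TrackedReal, b::Tracker.TrackedReal) = a.data <= b.data

    a, b = Tracker.param(1.0), Tracker.param(2.0)
    a.data      # 1.0 — what realpart(ta::Tracker.TrackedReal) now returns
    a <= b      # true, compared on the underlying Float64 values
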
src/samplers/hmc.jl (+5 -2)

@@ -41,10 +41,10 @@ end
 # Please see https://github.com/TuringLang/Turing.jl/pull/459 for explanations
 DEFAULT_ADAPT_CONF_TYPE = Nothing
 STAN_DEFAULT_ADAPT_CONF = nothing
-@init @require Stan="682df890-35be-576f-97d0-3d8c8b33a550" begin
+# @init @require Stan="682df890-35be-576f-97d0-3d8c8b33a550" begin
 DEFAULT_ADAPT_CONF_TYPE = Union{DEFAULT_ADAPT_CONF_TYPE,Stan.Adapt}
 STAN_DEFAULT_ADAPT_CONF = Stan.Adapt()
-end
+# end

 # NOTE: the implementation of HMC is removed,
 # it now reuses the one of HMCDA

@@ -181,6 +181,9 @@ assume(spl::Sampler{T}, dist::Distribution, vn::VarName, vi::VarInfo) where T<:H
   r = vi[vn]
   # acclogp!(vi, logpdf_with_trans(dist, r, istrans(vi, vn)))
   # r
+  @debug "dist = $dist"
+  @debug "vn = $vn"
+  @debug "r = $r" "typeof(r)=$(typeof(r))"
   r, logpdf_with_trans(dist, r, istrans(vi, vn))
 end

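Besides commenting out the Stan @require guard (Stan is now a hard dependency), the assume method gains @debug statements. These are hidden at the default log level; a sketch of how one might surface them using the standard Julia Logging stdlib (not part of the commit):

    using Logging

    # Print debug-level messages, including the new @debug lines, to stderr:
    global_logger(ConsoleLogger(stderr, Logging.Debug))

    # Alternatively, enable debug output for the Turing module only:
    # ENV["JULIA_DEBUG"] = "Turing"
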
src/samplers/hmcda.jl (+1 -7)

@@ -112,8 +112,6 @@ function step(model, spl::Sampler{HMCDA}, vi::VarInfo, is_first::Bool)
   push!(spl.info[:accept_his], false)

   # Reset Θ
-  # NOTE: ForwardDiff and ReverseDiff need different implementation
-  # due to struct Dual vs mutable TrackedReal
   if ADBACKEND == :forward_diff

     vi[spl] = θ

@@ -122,11 +120,7 @@ function step(model, spl::Sampler{HMCDA}, vi::VarInfo, is_first::Bool)

     vi_spl = vi[spl]
     for i = 1:length(θ)
-      if isa(vi_spl[i], ReverseDiff.TrackedReal)
-        vi_spl[i].value = θ[i]
-      else
-        vi_spl[i] = θ[i]
-      end
+      vi_spl[i] = θ[i]
     end

   end

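The ReverseDiff branch that mutated TrackedReal.value in place is gone: with Flux's Tracker the entries are simply overwritten, and tracking is re-established at the next gradient call. A toy sketch of that reset, with vi_spl as a hypothetical stand-in for vi[spl]:

    using Flux: Tracker

    θ = [0.1, 0.2]
    vi_spl = Real[Tracker.param(0.0), Tracker.param(0.0)]  # stand-in for vi[spl]

    for i = 1:length(θ)
        vi_spl[i] = θ[i]   # plain assignment replaces the tracked entry
    end
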
src/samplers/sampler.jl (+4)

@@ -44,6 +44,7 @@ assume(spl::Nothing, dist::Distribution, vn::VarName, vi::VarInfo) = begin
   # NOTE: The importance weight is not correctly computed here because
   # r is genereated from some uniform distribution which is different from the prior
   # acclogp!(vi, logpdf_with_trans(dist, r, istrans(vi, vn)))
+
   r, logpdf_with_trans(dist, r, istrans(vi, vn))
 end

@@ -90,6 +91,9 @@ end

 observe(spl::Nothing, dist::Distribution, value::Any, vi::VarInfo) = begin
   vi.num_produce += 1
+  @debug "dist = $dist"
+  @debug "value = $value"
+
   # acclogp!(vi, logpdf(dist, value))
   logpdf(dist, value)
 end

src/samplers/support/hmc_core.jl (+1 -5)

@@ -57,11 +57,7 @@ function gen_rev_func(vi, spl)
   elseif ADBACKEND == :reverse_diff
     vi_spl = vi[spl]
     for i = 1:length(θ_old)
-      if isa(vi_spl[i], ReverseDiff.TrackedReal)
-        vi_spl[i].value = θ_old[i]
-      else
-        vi_spl[i] = θ_old[i]
-      end
+      vi_spl[i] = θ_old[i]
     end
   end
   setlogp!(vi, old_logp)

src/transform.jl (+3 -3)

@@ -348,7 +348,7 @@ invlink(d::PDMatDistribution, Z::Vector{Matrix{T}}) where {T<:Real} = begin
   Z
 end

-logpdf_with_trans(d::PDMatDistribution, x::Array{T,2}, transform::Bool) where {T<:Real} = begin
+logpdf_with_trans(d::PDMatDistribution, x::Array{T0,2}, transform::Bool) where {T0<:Union{T,Tracker.TrackedReal{T}}} where {T<:Real} = begin
   lp = logpdf(d, x)
   if transform && isfinite(lp)
     U = cholesky(x).U

@@ -361,11 +361,11 @@ logpdf_with_trans(d::PDMatDistribution, x::Array{T,2}, transform::Bool) where {T
   lp
 end

-logpdf_with_trans(d::PDMatDistribution, X::Vector{Matrix{T}}, transform::Bool) where {T<:Real} = begin
+logpdf_with_trans(d::PDMatDistribution, X::Vector{Matrix{T0}}, transform::Bool) where {T0<:Union{T,Tracker.TrackedReal{T}}} where {T<:Real} = begin
   lp = logpdf(d, X)
   if transform && all(isfinite.(lp))
     n = length(X)
-    U = Vector{Matrix{T}}(undef, n)
+    U = Vector{Matrix{T0}}(undef, n)
     for i = 1:n
       U[i] = cholesky(X[i]).U'
     end

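The logpdf_with_trans signatures for PDMatDistribution are widened so the matrix element type may be either a plain real or a Tracker.TrackedReal wrapping it. A sketch of the same where-clause on a hypothetical helper, showing that both plain and tracked matrices dispatch to it:

    using Flux: Tracker

    # Hypothetical helper with the widened signature from transform.jl:
    accepts(x::Array{T0,2}) where {T0<:Union{T,Tracker.TrackedReal{T}}} where {T<:Real} = true

    accepts(rand(2, 2))                       # Matrix{Float64}
    accepts(map(Tracker.param, rand(2, 2)))   # Matrix of TrackedReal{Float64}
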
test/hmcda.jl/hmcda.jl (+1)

@@ -1,6 +1,7 @@
 using Turing
 using Test
 using Random
+using Distributions

 Random.seed!(128)
