denizyuret · ylxdzsw · Aug 7, 2017 · Aug 8, 2017 · Aug 9, 2017 · Aug 11, 2017
diff --git a/examples/charlm_using_modular_interface.jl b/examples/charlm_using_modular_interface.jl
@@ -0,0 +1,88 @@
+# The modular interface (src/model.jl) is only included by Knet on Julia 0.6 and
+# later, so fail early with a readable message on older versions.
+VERSION < v"0.6-" && error("currently modular interface only works for julia v0.6+")
+
+using Knet
+
+# load data
+
+# `data` is the whole content of data/10.txt, read as one string.
+const data = readstring(Knet.dir("data","10.txt"));
+# `dict` is the vocabulary: the distinct characters of `data`. The position of a
+# character in `dict` (looked up with `findfirst`) is used as its integer id.
+const dict = unique(data);
+
+# Random initializer: `rand(x...)` rescaled from [0, 1) to the range [-0.1, 0.1).
+function init(x...)
+    return .2 * rand(x...) .- .1
+end
+
+# define model
+
+# The model is a closure over its four layers:
+# embedding -> LSTM -> LSTM -> affine -> `logp`.
+const model = let
+    vocab   = length(dict)
+    encoder = Embedding(vocab, 64, init=init)
+    lstm1   = LSTM(64, 256, init=init)
+    lstm2   = LSTM(256, 256, init=init)
+    decoder = Affine(256, vocab, init=init)
+
+    # `x` holds the character ids of one time step and `h` is the state tuple
+    # (h1, c1, h2, c2) of the two LSTM layers.
+    # Returns the `logp` output together with the updated state tuple.
+    function CharLM(x, h)
+        (h1, c1, h2, c2) = h
+
+        embedded = encoder(x)
+        h1, c1   = lstm1(embedded, h1, c1)
+        h2, c2   = lstm2(h1, h2, c2)
+        scores   = logp(decoder(h2), 2)
+
+        return scores, (h1, c1, h2, c2)
+    end
+end
+
+# train model
+
+# Run one forward and backward pass over `seq`, a vector with one batch of
+# character ids per time step. The summed loss is printed and back-propagated;
+# the parameters themselves are not updated here.
+function bptt!(seq, seqlen=length(seq), batchsize=length(seq[1]))
+    seq   = track(seq) # enable auto diff
+    state = ntuple(i->zeros(batchsize, 256), 4)
+
+    loss = 0
+    for t in 1:seqlen-1
+        target = getval(seq)[t+1]
+        pred, state = model(seq[t], state)
+        # linear (column-major) index of entry (b, target[b]);
+        # assumes `pred` has `batchsize` rows -- TODO confirm
+        picked = [batchsize * (target[b] - 1) + b for b in 1:batchsize]
+        loss += -sum(pred[picked])
+    end
+
+    println("loss: ", getval(loss))
+
+    return back!(loss, 1)
+end
+
+for epoch in 1:1000
+    println("epoch: $epoch")
+
+    # the sequence length grows with the epoch up to a slightly randomized cap
+    seqlen = min(20 + epoch, 150+rand(0:5))
+    batchsize = 32
+    chunk = batchsize * seqlen   # characters consumed by one mini-batch
+
+    for nbatch in 0:length(data)÷chunk-2
+        base = nbatch * chunk
+        # seq[i][j+1] is the id of the i-th character of row j of this mini-batch
+        seq = [[findfirst(dict, data[base + j*seqlen + i]) for j in 0:batchsize-1]
+               for i in 1:seqlen]
+
+        bptt!(seq)
+
+        # plain gradient step on every tracked parameter of the model
+        for p in params(model)
+            update!(getval(p), getgrad(p), lr=.001)
+        end
+    end
+end
+
+# generate a seq
+
+# Draw one character of `dict` at random, where `p[c]` is the probability of
+# `dict[c]`. Returns `nothing` only when `p` is empty.
+function sample(p)
+    r = rand()
+    for c in eachindex(p)
+        r -= p[c]
+        r <= 0 && return dict[c]
+    end
+    # Because of floating-point rounding `p` can sum to slightly less than 1, so
+    # the loop may finish without a choice. Fall back to the last character
+    # instead of returning `nothing`, which the caller cannot look up in `dict`.
+    return isempty(p) ? nothing : dict[length(p)]
+end
+
+# start from an all-zero state and a newline, then sample 800 characters,
+# feeding each sampled character back in as the next input
+h    = ntuple(i->zeros(1, 256), 4)
+last = '\n'
+
+for i in 1:800
+    id = [findfirst(dict, last)]
+    pred, h = model(id, h)
+    last = sample(exp.(pred))
+    print(last)
+end
diff --git a/src/Knet.jl b/src/Knet.jl
@@ -25,6 +25,8 @@ include("distributions.jl"); 	export gaussian, xavier, bilinear
 include("random.jl");           export setseed
 include("hyperopt.jl");         export hyperband, goldensection
 
+# model.jl uses Julia 0.6 syntax (`abstract type`, `mutable struct`), so it is
+# only loaded on 0.6 or newer; `>=` is the exact complement of the `<` guard
+# used by the example script.
+VERSION >= v"0.6-" && include("model.jl")
+
 """
     Knet.dir(path...)
 

diff --git a/src/model.jl b/src/model.jl
@@ -0,0 +1,211 @@
+export track, back!, getgrad, setgrad!, params
+
+# Common supertype of the layer types defined in this file
+# (Affine, Chain, LSTM, Embedding, RNN, GRU).
+abstract type Model end
+
+"""
+    tracked_x = track(x)
+
+Track an input. `x` can be a tuple, array or dict. If `x` is already tracked, its
+value is tracked again on a new tape.
+
+    tracked_x = track(x, tape)
+
+Track `x` (or the value of an already tracked `x`) on the given `AutoGrad.Tape`.
+
+    tracked_p = track(p, x)
+
+Track `p` on the same tape as `x` and return the tracked `p`. If `x` is not
+tracked, `p` is untracked as well, i.e. its plain value is returned.
+"""
+track(x::AutoGrad.Rec) = track(x.value)
+track(x, tape::AutoGrad.Tape=AutoGrad.Tape()) = AutoGrad.Rec(x, tape)
+# Without this method a call with a tracked `x` and an explicit tape is ambiguous
+# between `track(x, tape::Tape)` and `track(x::Rec, y)`.
+track(x::AutoGrad.Rec, tape::AutoGrad.Tape) = AutoGrad.Rec(x.value, tape)
+track(x, y) = x
+track(x, y::AutoGrad.Rec) = AutoGrad.Rec(x, y.tapes[])
+track(x::AutoGrad.Rec, y) = x.value
+# NOTE(review): `y.tapes[]` only works while `y` lives on exactly one tape -- confirm
+track(x::AutoGrad.Rec, y::AutoGrad.Rec) = x in y.tapes[] ? x : track(x.value, y)
+
+"""
+    back!(x)
+    back!(x, Δ)
+
+Run the backward pass from the tracked output `x` over its tape. With `Δ` given,
+the gradient of `x` is first set to `Δ`. Returns the gradient stored on the first
+node of the tape (the gradient of the input).
+
+Example:
+
+```
+x = track([1,-1,1])
+p = track([2,3,4], x)
+y = x .* p
+back!(y, [1,1,1]) # => [2,3,4]
+getgrad(p) # => [1,-1,1]
+```
+"""
+function back!(x::AutoGrad.Rec)
+    tape = x.tapes[]
+    AutoGrad.complete!(tape)
+
+    # copied from AutoGrad.jl/src/core.jl:backward_pass
+    for n in tape[end-1:-1:1]
+        # nodes that received no gradient contribute nothing to their parents;
+        # `===` is an identity test and never dispatches to an overloaded `==`
+        n.outgrad === nothing && continue
+        r = n.rec
+        for i in eachindex(n.parents)
+            isassigned(n.parents,i) || continue
+            p = n.parents[i]
+            # gradient of the recorded call with respect to its i-th argument
+            og = r.func(AutoGrad.Grad{i},n.outgrad,r.value,r.args...;r.kwargs...)
+            p.outgrad = AutoGrad.sum_outgrads(p.outgrad, og)
+        end
+    end
+
+    return tape[1].outgrad
+end
+
+# Seed the gradient of `x` with `Δ`, then run the backward pass starting at `x`.
+back!(x::AutoGrad.Rec, Δ) = (setgrad!(x, Δ); back!(x))
+
+"""
+    getgrad(x)
+
+Return the gradient of the tracked variable `x`, or `nothing` if `x` is not used.
+"""
+getgrad(x::AutoGrad.Rec) = x.nodes[].outgrad
+
+"""
+    setgrad!(x, Δ)
+
+Store `Δ` as the gradient of the tracked variable `x`.
+"""
+setgrad!(x::AutoGrad.Rec, Δ) = (x.nodes[].outgrad = Δ)
+
+"""
+    params(m)
+
+Get a list of the tracked parameters of a model.
+A model's parameters become tracked by running the model with a tracked input.
+Complex models should override this method for efficiency.
+
+`m` is searched recursively: iterable values are iterated, everything else is
+searched through its fields.
+"""
+function params(m)
+    found = ObjectIdDict()   # tracked parameters, used as an identity set
+    seen  = ObjectIdDict()   # values already visited, guards against cycles
+
+    traverse(m::AutoGrad.Rec) = found[m] = nothing
+    # Leaves that cannot hold parameters. Numbers and characters iterate over
+    # themselves, so the generic method below would recurse on them without end
+    # (once per element for an untracked numeric array).
+    traverse(m::Union{Number,Char,AbstractString,Symbol,Type}) = nothing
+    traverse(m::AbstractArray{<:Number}) = nothing
+    function traverse(m)
+        haskey(seen, m) && return nothing
+        seen[m] = nothing
+        try
+            foreach(traverse, m)
+        catch
+            # not iterable: look through the fields instead
+            for attr in fieldnames(typeof(m))
+                isdefined(m, attr) || continue
+                traverse(getfield(m, attr))
+            end
+        end
+        return nothing
+    end
+
+    traverse(m)
+
+    return collect(AutoGrad.Rec, keys(found))
+end
+
+
+export Affine, Chain, LSTM, Embedding, RNN, GRU
+
+# Fully connected layer computing `x * weight .+ bias`.
+# The fields are untyped and mutable because every forward pass replaces them
+# with the result of `track(field, x)`.
+mutable struct Affine <: Model
+    weight
+    bias
+end
+
+function Affine(a::Integer, b::Integer; init=xavier)
+    return Affine(init(a, b), zeros(1, b))
+end
+
+function (m::Affine)(x)
+    # put the parameters on the tape of `x` (or untrack them) before use
+    w = m.weight = track(m.weight, x)
+    b = m.bias   = track(m.bias, x)
+    return x * w .+ b
+end
+
+# Sequential container: feeds the input through `layers` one after another.
+mutable struct Chain <: Model
+    layers::Vector
+end
+
+Chain(x...) = Chain(collect(x))
+
+function (m::Chain)(x)
+    for layer in m.layers
+        x = layer(x)
+    end
+    return x
+end
+
+# LSTM cell. `weight` multiplies the concatenation `[x hidden]` and `bias` is
+# added to the product; both carry four gate blocks side by side.
+mutable struct LSTM <: Model
+    weight
+    bias
+end
+
+# `a` is the input size, `b` the hidden size.
+LSTM(a::Integer, b::Integer; init=rand) = LSTM(init(a+b, 4b), zeros(1, 4b))
+
+# patch for ambiguity of Base/abstractarray.jl:1067 and AutoGrad/abstractarray.jl:168
+# Sends `hcat` of two or more tracked values (for example `[x hidden]` when both
+# are tracked) to AutoGrad's `cat` along dimension 2.
+Base.hcat(a::AutoGrad.Rec, b::AutoGrad.Rec, c::AutoGrad.Rec...) = AutoGrad.cat(2, a, b, c...)
+
+# One LSTM step for input `x` and previous state (`hidden`, `cell`).
+# Returns the new (`hidden`, `cell`) pair.
+function (m::LSTM)(x, hidden, cell)
+    m.weight = track(m.weight, x)
+    m.bias   = track(m.bias, x)
+
+    gates = [x hidden] * m.weight .+ m.bias
+    n     = size(hidden,2)
+    # the first three gates are column blocks of width n, the last takes the rest
+    f = sigm(gates[:,1:n])
+    i = sigm(gates[:,n+1:2n])
+    o = sigm(gates[:,2n+1:3n])
+    g = tanh(gates[:,3n+1:end])
+
+    newcell   = cell .* f + i .* g
+    newhidden = o .* tanh(newcell)
+    return newhidden, newcell
+end
+
+"""
+Lookup table that maps integer ids to rows of `mat`.
+
+`back!` through Embedding will lose all gradients: the ids are used as plain
+indices into `mat`.
+"""
+mutable struct Embedding <: Model
+    mat
+end
+
+Embedding(a::Integer, b::Integer; init=rand) = Embedding(init(a, b))
+
+# tracked ids: index with the underlying value
+function (m::Embedding)(x::AutoGrad.Rec)
+    table = m.mat = track(m.mat, x)
+    return table[x.value, :]
+end
+
+# plain ids
+function (m::Embedding)(x)
+    table = m.mat = track(m.mat, x)
+    return table[x, :]
+end
+
+# Simple recurrent cell: `tanh([x h] * weight .+ bias)`.
+mutable struct RNN <: Model
+    weight
+    bias
+end
+
+# `a` is the input size, `b` the hidden size.
+RNN(a::Integer, b::Integer; init=rand) = RNN(init(a+b, b), zeros(1, b))
+
+# One step for input `x` and previous hidden state `h`; returns the new state.
+function (m::RNN)(x, h)
+    m.weight = track(m.weight, x)
+    m.bias   = track(m.bias, x)
+
+    preact = [x h] * m.weight .+ m.bias
+    return tanh(preact)
+end
+
+# GRU cell with separate input (`Wih`, `bih`) and hidden (`Whh`, `bhh`)
+# parameters; each holds three gate blocks side by side.
+mutable struct GRU <: Model
+    Wih
+    Whh
+    bih
+    bhh
+end
+
+# `a` is the input size, `b` the hidden size.
+GRU(a::Integer, b::Integer; init=rand) =
+    GRU(init(a, 3b), init(b, 3b), zeros(1, 3b), zeros(1, 3b))
+
+# One step for input `x` and previous hidden state `h`; returns the new state.
+function (m::GRU)(x, h)
+    m.Wih = track(m.Wih, x)
+    m.Whh = track(m.Whh, x)
+    m.bih = track(m.bih, x)
+    m.bhh = track(m.bhh, x)
+
+    gi = x * m.Wih .+ m.bih
+    gh = h * m.Whh .+ m.bhh
+
+    n = size(h, 2)
+    c1, c2, c3 = 1:n, n+1:2n, 2n+1:3n   # column ranges of the three gate blocks
+
+    ir, ii, inew = gi[:, c1], gi[:, c2], gi[:, c3]
+    hr, hi, hnew = gh[:, c1], gh[:, c2], gh[:, c3]
+
+    rg = sigm(ir + hr)
+    ig = sigm(ii + hi)
+    ng = tanh(inew + rg .* hnew)
+    # algebraically the same as (1 - ig) .* ng + ig .* h
+    return ng + ig .* (h - ng)
+end
diff --git a/src/unary.jl b/src/unary.jl
@@ -122,6 +122,7 @@ for (f,g,y,dx) in
         end
         $f{T<:Number}(xi::T)=$y
         $g{T<:Number}(dyi::T,yi::T)=$dx
+        $g(dy::AutoGrad.Rec,y)=$g(dy.value,y)
         @primitive $f(x),dy,y $g(dy,y)
     end
 end