Skip to content

Commit

Permalink
[NN] Breaking, batch_size is now first (NCHW default) fix #132
Browse files Browse the repository at this point in the history
  • Loading branch information
mratsim committed Dec 16, 2017
1 parent a76c36d commit 1024c7a
Show file tree
Hide file tree
Showing 8 changed files with 158 additions and 186 deletions.
35 changes: 20 additions & 15 deletions benchmarks/ex01_xor.nim
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
import ../src/arraymancer

# Learning XOR function with a neural network.

# Autograd context / neuralnet graph
let ctx = newContext Tensor[float32]

let bsz = 32 #batch size
let bsz = 32 # batch size

# We will create a tensor of size 3200 --> 100 batch sizes of 32
# We create it as int between [0, 2[ (2 excluded) and convert to bool
# We will create a tensor of size 3200 (100 batches of size 32)
# We create it as int between [0, 2[ and convert to bool
let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool)

# Let's build or truth labels. We need to apply xor between the 2 columns of the tensors
# Let's build our truth labels. We need to apply xor between the 2 columns of the tensors
let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]

# Convert to float and transpose so batch_size is last
let x_train = ctx.variable(x_train_bool.astype(float32).transpose)
let y = y_bool.astype(float32).transpose
# Convert to float
let x_train = ctx.variable(x_train_bool.astype(float32))
let y = y_bool.astype(float32)

# We will build the following network:
# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss

# First hidden layer of 3 neurons, with 2 features in
# First hidden layer of 3 neurons, shape [3 out_features, 2 in_features]
# We initialize with random weights between -1 and 1
let layer_3neurons = ctx.variable(
randomTensor(3, 2, 2.0f) .- 1.0f
Expand All @@ -32,19 +38,18 @@ let optim = newSGD[float32](
layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate
)

# Learning loop
for epoch in 0..10000:

for batch_id in 0..<100:

# offset in the Tensor (Remember, batch size is last)
# minibatch offset in the Tensor
let offset = batch_id * 32
let x = x_train[_, offset ..< offset + 32]
let target = y[_, offset ..< offset + 32]
let x = x_train[offset ..< offset + 32, _]
let target = y[offset ..< offset + 32, _]

# Building the network
let n1 = linear(x, layer_3neurons)
let n1_relu = n1.relu
let n2 = linear(n1_relu, classifier_layer)
let n1 = relu linear(x, layer_3neurons)
let n2 = linear(n1, classifier_layer)
let loss = sigmoid_cross_entropy(n2, target)

# Compute the gradient (i.e. contribution of each parameter to the loss)
Expand Down
114 changes: 24 additions & 90 deletions examples/ex01_xor_perceptron_from_scratch.nim
Original file line number Diff line number Diff line change
@@ -1,82 +1,28 @@
import ../src/arraymancer

# Example multilayer perceptron in Arraymancer.

# We will use as examples the OR function similar to this article:
# https://blog.dbrgn.ch/2013/3/26/perceptrons-in-python/


# Okay let's start
# With x and y being one sample, the perceptron equation is
#
# Layer 1
# n1 = relu(a1 * x + b1 * y + c1) # First neuron + relu activation
# n2 = relu(a2 * x + b2 * y + c2) # 2nd neuron + relu activation
# n3 = relu(a3 * x + b3 * y + c3) # 3rd neuron + relu activation
#
# Layer 2
# classifier = a4 * n1 + b4 * n2 + c4 * n3
#
# Loss
# loss = cross_entropy(sigmoid(classifier))

# In terms of high level layers this becomes:
# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss

# Let's go
# Learning XOR function with a neural network.

# First create a context that will store backpropagation information
# Autograd context / neuralnet graph
let ctx = newContext Tensor[float32]

# We will pass batches of 32 samples
let bsz = 32 #batch size

# We will create a tensor of size 3200 --> 100 batch sizes of 32
# We create it as int between [0, 2[ (2 excluded) and convert to bool
let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool) # generate batch_size examples of (0,1) combination

# Let's check the first 32
echo x_train_bool[0..<32, _]
# Tensor of shape 32x2 of type "bool" on backend "Cpu"
# |true false|
# |true true|
# |false false|
# |false true|
# |false false|
# |false false|
# |false false|
# ...

# Let's build or truth labels. We need to apply xor between the 2 columns of the tensors
let bsz = 32 # batch size

# We will create a tensor of size 3200 (100 batches of size 32)
# We create it as int between [0, 2[ and convert to bool
let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool)

# Let's build our truth labels. We need to apply xor between the 2 columns of the tensors
let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]

# Convert to float
let x_train = ctx.variable(x_train_bool.astype(float32))
let y = y_bool.astype(float32)

# We will build the following network:
# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss

echo y_bool[0..<32, _]
# Tensor of shape 32x1 of type "bool" on backend "Cpu"
# true|
# false|
# false|
# true|
# false|
# false|
# false|
# true|
# false|
# ...

# Convert to float.
# Important: At the moment, Arraymancer expects batch size to be last
# so we transpose. In the future Arraymancer will be flexible.
let x_train = ctx.variable(x_train_bool.astype(float32).transpose)
let y = y_bool.astype(float32).transpose

# Now we create layer of neurons W that we will train to reproduce the xor function.
# Weights are of this shape: [W: out_features, in_features]

# First hidden layer of 3 neurons, with 2 features in
# First hidden layer of 3 neurons, shape [3 out_features, 2 in_features]
# We initialize with random weights between -1 and 1
# (We initialize them between 0.0f and 2.0f and then minus 1.0f)
# .- is the minus broadcasting operator
let layer_3neurons = ctx.variable(
randomTensor(3, 2, 2.0f) .- 1.0f
)
Expand All @@ -86,40 +32,28 @@ let layer_3neurons = ctx.variable(
let classifier_layer = ctx.variable(
randomTensor(1, 3, 2.0f) .- 1.0f
)
# We use Stochastic Gradient Descent as optimizer
# With gradient descent the weights are updated as follows:
# W -= learning_rate * dW

# Stochastic Gradient Descent
let optim = newSGD[float32](
layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate
)

# Now let's setup the training loops.
# First loop is passing the mini-batch, backpropagating, updating the gradients.
# We do it until the whole x_train tensor has been passed through.
# This is one "epoch".

# Usually after each epoch we "validate" how well the network generalizes,
# using a test set that it was never trained on. In this example we won't go there to keep it short.

# We will do 6 epochs (0..5 is inclusive), each passing 100 minibatches of 32 samples
# Learning loop
for epoch in 0..5:

for batch_id in 0..<100:

# offset in the Tensor (Remember, batch size is last)
# minibatch offset in the Tensor
let offset = batch_id * 32
let x = x_train[_, offset ..< offset + 32]
let target = y[_, offset ..< offset + 32]
let x = x_train[offset ..< offset + 32, _]
let target = y[offset ..< offset + 32, _]

# Building the network
let n1 = linear(x, layer_3neurons)
let n1_relu = n1.relu
let n2 = linear(n1_relu, classifier_layer)
let n1 = relu linear(x, layer_3neurons)
let n2 = linear(n1, classifier_layer)
let loss = sigmoid_cross_entropy(n2, target)

echo "Epoch is:" & $epoch
echo "Batch id:" & $batch_id

echo "Loss is:" & $loss.value.data[0]

# Compute the gradient (i.e. contribution of each parameter to the loss)
Expand Down
67 changes: 43 additions & 24 deletions src/nn/layers/linear.nim
Original file line number Diff line number Diff line change
Expand Up @@ -14,33 +14,52 @@

import ../../private/ast_utils,
../../tensor/tensor,
../../nn_primitives/nn_primitives,
../../autograd/autograd,
./layer



type LinearGate* {.final.} [TT] = ref object of Gate[TT]
## TODO: use fused AddMatMul gate: C <- alpha AB + beta C
x, W, b: Variable[TT]
input, weight, bias: Variable[TT]

method forward*[TT](self: LinearGate[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}=
method forward*[TT](self: LinearGate[TT], input: Variable[TT]): Variable[TT] {.inline, locks:0.}=
new result

result.tape = a.tape
result.value = self.W.value * a.value
if not self.b.isNil:
result.value .+= self.b.value # Bias is broadcasted other the whole batch size
result.grad = zeros_like(result.value)

method backward*[TT](self: LinearGate[TT], gradient: TT): SmallDiffs[TT] {.noInit, inline, locks:0.}=
result[0] = self.W.value.transpose * gradient # grad w.r.t. x
result[1] = gradient * self.x.value.transpose # grad w.r.t. weight
if self.bias.isNil:
linear(input.value, self.weight.value, result.value)
else:
linear(input.value, self.weight.value, self.bias.value, result.value)

if not self.b.isNil:
result[2] = sum(gradient, axis=0) # grad w.r.t. bias
# https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html
result.tape = input.tape
result.grad = zeros_like(result.value)

proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT] =
method backward*[TT](self: LinearGate[TT], gradOutput: TT): SmallDiffs[TT] {.noInit, inline, locks:0.}=
# result[0] grad w.r.t. input
# result[1] grad w.r.t. weight
# result[2] grad w.r.t. bias

if self.bias.isNil:
linear_backward(
self.input.value,
self.weight.value,
gradOutput,
result[0],
result[1]
)
else:
linear_backward(
self.input.value,
self.weight.value,
self.bias.value,
gradOutput,
result[0],
result[1],
result[2]
)

proc linear*[TT](input, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT] =
## Input:
## - An input Variable of shape [batch_size, in_features]
## - A weight Variable of shape [out_features, in_features]
Expand All @@ -57,12 +76,12 @@ proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT
## - Experimental, there are no tests yet for this layer

when compileOption("boundChecks"):
if x.value.rank > 2:
if input.value.rank > 2:
raise newException(ValueError, "Tensor must be flattened for a linear layer (features, batch_size)")

check_ctx(x, weight)
check_ctx(input, weight)
if not bias.isNil:
check_ctx(x, bias)
check_ctx(input, bias)

# weight has shape: Out_features * In_features
# bias must have shape: Out_features * 1
Expand All @@ -73,23 +92,23 @@ proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT
var gate: LinearGate[TT]
new gate
gate.arity = if bias.isNil: 2 else: 3
gate.x = x
gate.W = weight
gate.b = bias
gate.input = input
gate.weight = weight
gate.bias = bias

# Node
var node: Node[TT]
new node

node.gate = gate
node.parents[0] = x
node.parents[0] = input
node.parents[1] = weight
if not bias.isNil:
node.parents[2] = bias

x.tape.push(node)
input.tape.push(node)

# Resulting var
result = gate.forward(x)
result = gate.forward(input)
result.ancestor = node
node.child = result
4 changes: 2 additions & 2 deletions src/nn/loss/cross_entropy_losses.nim
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ template gen_cross_entropy_loss(LossType, forward_proc, backward_proc: untyped)
# target, from Loss

method forward*[TT](self: LossType[TT], a: Variable[TT], target: TT): Variable[TT] {.inline, locks:0.}=
# We expect a in shape [features, batch_size]
# We expect a in shape [batch_size, features]

new result
result.tape = a.tape
Expand Down Expand Up @@ -74,7 +74,7 @@ type SparseSoftmaxCrossEntropyLoss* {.final.} [TT] = ref object of SparseLoss[TT
# target, from Loss

method forward*[TT](self: SparseSoftmaxCrossEntropyLoss[TT], a: Variable[TT], target: Tensor[int]): Variable[TT] {.inline, locks:0.}=
# We expect a in shape [features, batch_size]
# We expect a in shape [batch_size, features]

new result
result.tape = a.tape
Expand Down
Loading

0 comments on commit 1024c7a

Please sign in to comment.