[NN] Breaking, batch_size is now first (NCHW default) fix #132

mratsim · Dec 16, 2017 · 1024c7a · 1024c7a
1 parent a76c36d
commit 1024c7a
Show file tree

Hide file tree

Showing 8 changed files with 158 additions and 186 deletions.
diff --git a/benchmarks/ex01_xor.nim b/benchmarks/ex01_xor.nim
@@ -1,21 +1,27 @@
 import ../src/arraymancer
 
+# Learning XOR function with a neural network.
+
+# Autograd context / neuralnet graph
 let ctx = newContext Tensor[float32]
 
-let bsz = 32 #batch size
+let bsz = 32 # batch size
 
-# We will create a tensor of size 3200 --> 100 batch sizes of 32
-# We create it as int between [0, 2[ (2 excluded) and convert to bool
+# We will create a tensor of size 3200 (100 batches of size 32)
+# We create it as int between [0, 2[ and convert to bool
 let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool)
 
-# Let's build or truth labels. We need to apply xor between the 2 columns of the tensors
+# Let's build our truth labels. We need to apply xor between the 2 columns of the tensors
 let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]
 
-# Convert to float and transpose so batch_size is last
-let x_train = ctx.variable(x_train_bool.astype(float32).transpose)
-let y = y_bool.astype(float32).transpose
+# Convert to float
+let x_train = ctx.variable(x_train_bool.astype(float32))
+let y = y_bool.astype(float32)
+
+# We will build the following network:
+# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss
 
-# First hidden layer of 3 neurons, with 2 features in
+# First hidden layer of 3 neurons, shape [3 out_features, 2 in_features]
 # We initialize with random weights between -1 and 1
 let layer_3neurons = ctx.variable(
                       randomTensor(3, 2, 2.0f) .- 1.0f
@@ -32,19 +38,18 @@ let optim = newSGD[float32](
   layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate
 )
 
+# Learning loop
 for epoch in 0..10000:
-
   for batch_id in 0..<100:
 
-    # offset in the Tensor (Remember, batch size is last)
+    # minibatch offset in the Tensor
     let offset = batch_id * 32
-    let x = x_train[_, offset ..< offset + 32]
-    let target = y[_, offset ..< offset + 32]
+    let x = x_train[offset ..< offset + 32, _]
+    let target = y[offset ..< offset + 32, _]
 
     # Building the network
-    let n1 = linear(x, layer_3neurons)
-    let n1_relu = n1.relu
-    let n2 = linear(n1_relu, classifier_layer)
+    let n1 = relu linear(x, layer_3neurons)
+    let n2 = linear(n1, classifier_layer)
     let loss = sigmoid_cross_entropy(n2, target)
 
     # Compute the gradient (i.e. contribution of each parameter to the loss)

diff --git a/examples/ex01_xor_perceptron_from_scratch.nim b/examples/ex01_xor_perceptron_from_scratch.nim
@@ -1,82 +1,28 @@
 import ../src/arraymancer
 
-# Example multilayer perceptron in Arraymancer.
-
-# We will use as examples the OR function similar to this article:
-# https://blog.dbrgn.ch/2013/3/26/perceptrons-in-python/
-
-
-# Okay let's start
-# With x and y being one sample, the perceptron equation is
-#
-# Layer 1
-# n1 = relu(a1 * x + b1 * y + c1) # First neuron + relu activation
-# n2 = relu(a2 * x + b2 * y + c2) # 2nd neuron + relu activation
-# n3 = relu(a3 * x + b3 * y + c3) # 3nd neuron + relu activation
-#
-# Layer 2
-# classifier =  a4 * n1 + b4 * n2 + c4 * n3
-#
-# Loss
-# loss = cross_entropy(sigmoid(classifier))
-
-# In terms of high level layers this becomes:
-# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss
-
-# Let's go
+# Learning XOR function with a neural network.
 
-# First create a context that will store backpropagation information
+# Autograd context / neuralnet graph
 let ctx = newContext Tensor[float32]
 
-# We will pass batches of 32 samples
-let bsz = 32 #batch size
-
-# We will create a tensor of size 3200 --> 100 batch sizes of 32
-# We create it as int between [0, 2[ (2 excluded) and convert to bool
-let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool) # generate batch_size examples of (0,1) combination
-
-# Let's check the first 32
-echo x_train_bool[0..<32, _]
-# Tensor of shape 32x2 of type "bool" on backend "Cpu"
-# |true   false|
-# |true   true|
-# |false  false|
-# |false  true|
-# |false  false|
-# |false  false|
-# |false  false|
-# ...
-
-# Let's build or truth labels. We need to apply xor between the 2 columns of the tensors
+let bsz = 32 # batch size
+
+# We will create a tensor of size 3200 (100 batches of size 32)
+# We create it as int between [0, 2[ and convert to bool
+let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool)
+
+# Let's build our truth labels. We need to apply xor between the 2 columns of the tensors
 let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]
 
+# Convert to float
+let x_train = ctx.variable(x_train_bool.astype(float32))
+let y = y_bool.astype(float32)
+
+# We will build the following network:
+# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss
 
-echo y_bool[0..<32, _]
-# Tensor of shape 32x1 of type "bool" on backend "Cpu"
-#         true|
-#         false|
-#         false|
-#         true|
-#         false|
-#         false|
-#         false|
-#         true|
-#         false|
-#         ...
-
-# Convert to float.
-# Important: At the moment, Arraymancer expects batch size to be last
-# so we transpose. In the future Arraymancer will be flexible.
-let x_train = ctx.variable(x_train_bool.astype(float32).transpose)
-let y = y_bool.astype(float32).transpose
-
-# Now we create layer of neurons W that we will train to reproduce the xor function.
-# Weights are of this shape: [W: out_features, in_features]
-
-# First hidden layer of 3 neurons, with 2 features in
+# First hidden layer of 3 neurons, shape [3 out_features, 2 in_features]
 # We initialize with random weights between -1 and 1
-# (We initialize them between 0.0f and 2.0f and then minus 1.0f)
-# .- is the minus broadcasting operator
 let layer_3neurons = ctx.variable(
                       randomTensor(3, 2, 2.0f) .- 1.0f
                       )
@@ -86,40 +32,28 @@ let layer_3neurons = ctx.variable(
 let classifier_layer = ctx.variable(
                   randomTensor(1, 3, 2.0f) .- 1.0f
                   )
-# We use Stochastic Gradient Descent as optimizer
-# With gradient descent the weigth are updated as follows:
-# W -= learning_rate * dW
+
+# Stochastic Gradient Descent
 let optim = newSGD[float32](
   layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate
 )
 
-# Now let's setup the training loops.
-# First loop is passing the mini-batch, bacpropagating, updating the gradients.
-# We do it until the whole x_train tensor has been passed through.
-# This is one "epoch".
-
-# Usually after each epoch we "validate" with a test set that the network was never trained on
-# how the network generalized. In this example we won't go there to keep it short.
-
-# We will do 5 epochs, passing the 32*100 minibatches
+# Learning loop
 for epoch in 0..5:
-
   for batch_id in 0..<100:
 
-    # offset in the Tensor (Remember, batch size is last)
+    # minibatch offset in the Tensor
     let offset = batch_id * 32
-    let x = x_train[_, offset ..< offset + 32]
-    let target = y[_, offset ..< offset + 32]
+    let x = x_train[offset ..< offset + 32, _]
+    let target = y[offset ..< offset + 32, _]
 
     # Building the network
-    let n1 = linear(x, layer_3neurons)
-    let n1_relu = n1.relu
-    let n2 = linear(n1_relu, classifier_layer)
+    let n1 = relu linear(x, layer_3neurons)
+    let n2 = linear(n1, classifier_layer)
     let loss = sigmoid_cross_entropy(n2, target)
 
     echo "Epoch is:" & $epoch
     echo "Batch id:" & $batch_id
-
     echo "Loss is:" & $loss.value.data[0]
 
     # Compute the gradient (i.e. contribution of each parameter to the loss)

diff --git a/src/nn/layers/linear.nim b/src/nn/layers/linear.nim
@@ -14,33 +14,52 @@
 
 import  ../../private/ast_utils,
         ../../tensor/tensor,
+        ../../nn_primitives/nn_primitives,
         ../../autograd/autograd,
         ./layer
 
 
 
 type LinearGate* {.final.} [TT] = ref object of Gate[TT]
   ## TODO: use fused AddMatMul gate: C <- alpha AB + beta C
-  x, W, b: Variable[TT]
+  input, weight, bias: Variable[TT]
 
-method forward*[TT](self: LinearGate[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}=
+method forward*[TT](self: LinearGate[TT], input: Variable[TT]): Variable[TT] {.inline, locks:0.}=
   new result
 
-  result.tape = a.tape
-  result.value = self.W.value * a.value
-  if not self.b.isNil:
-    result.value .+= self.b.value # Bias is broadcasted other the whole batch size
-  result.grad = zeros_like(result.value)
-
-method backward*[TT](self: LinearGate[TT], gradient: TT): SmallDiffs[TT] {.noInit, inline, locks:0.}=
-  result[0] = self.W.value.transpose * gradient # grad w.r.t. x
-  result[1] = gradient * self.x.value.transpose # grad w.r.t. weight
+  if self.bias.isNil:
+    linear(input.value, self.weight.value, result.value)
+  else:
+    linear(input.value, self.weight.value, self.bias.value, result.value)
 
-  if not self.b.isNil:
-    result[2] = sum(gradient, axis=0) # grad w.r.t. bias
-    # https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html
+  result.tape = input.tape
+  result.grad = zeros_like(result.value)
 
-proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT] =
+method backward*[TT](self: LinearGate[TT], gradOutput: TT): SmallDiffs[TT] {.noInit, inline, locks:0.}=
+  # result[0] grad w.r.t. input
+  # result[1] grad w.r.t. weight
+  # result[2] grad w.r.t. bias
+
+  if self.bias.isNil:
+    linear_backward(
+      self.input.value,
+      self.weight.value,
+      gradOutput,
+      result[0],
+      result[1]
+    )
+  else:
+    linear_backward(
+      self.input.value,
+      self.weight.value,
+      self.bias.value,
+      gradOutput,
+      result[0],
+      result[1],
+      result[2]
+    )
+
+proc linear*[TT](input, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT] =
   ## Input:
   ##   - An input Variable of shape [batch_size, in_features]
   ##   - A weight Variable of shape [out_features, in_features]
@@ -57,12 +76,12 @@ proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT
   ##  - Experimental, there is no tests yet for this layer
 
   when compileOption("boundChecks"):
-    if x.value.rank > 2:
+    if input.value.rank > 2:
       raise newException(ValueError, "Tensor must be flattened for a linear layer (features, batch_size)")
 
-    check_ctx(x, weight)
+    check_ctx(input, weight)
     if not bias.isNil:
-      check_ctx(x, bias)
+      check_ctx(input, bias)
 
     # weight has shape: Out_features * In_features
     # bias must have shape: Out_features * 1
@@ -73,23 +92,23 @@ proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT
   var gate: LinearGate[TT]
   new gate
   gate.arity = if bias.isNil: 2 else: 3
-  gate.x = x
-  gate.W = weight
-  gate.b = bias
+  gate.input = input
+  gate.weight = weight
+  gate.bias = bias
 
   # Node
   var node: Node[TT]
   new node
 
   node.gate = gate
-  node.parents[0] = x
+  node.parents[0] = input
   node.parents[1] = weight
   if not bias.isNil:
     node.parents[2] = bias
 
-  x.tape.push(node)
+  input.tape.push(node)
 
   # Resulting var
-  result = gate.forward(x)
+  result = gate.forward(input)
   result.ancestor = node
   node.child = result
diff --git a/src/nn/loss/cross_entropy_losses.nim b/src/nn/loss/cross_entropy_losses.nim
@@ -27,7 +27,7 @@ template gen_cross_entropy_loss(LossType, forward_proc, backward_proc: untyped)
     # target, from Loss
 
   method forward*[TT](self: LossType[TT], a: Variable[TT], target: TT): Variable[TT] {.inline, locks:0.}=
-    # We expect a in shape [features, batch_size]
+    # We expect `a` to have shape [batch_size, features]
 
     new result
     result.tape = a.tape
@@ -74,7 +74,7 @@ type SparseSoftmaxCrossEntropyLoss* {.final.} [TT] = ref object of SparseLoss[TT
   # target, from Loss
 
 method forward*[TT](self: SparseSoftmaxCrossEntropyLoss[TT], a: Variable[TT], target: Tensor[int]): Variable[TT] {.inline, locks:0.}=
-  # We expect a in shape [features, batch_size]
+  # We expect `a` to have shape [batch_size, features]
 
   new result
   result.tape = a.tape