From 1024c7af4d488a5580d0b94144cd88bd071ae66d Mon Sep 17 00:00:00 2001
From: mratsim <mamy.ratsimbazafy_dev@gadz.org>
Date: Sat, 16 Dec 2017 20:22:01 +0100
Subject: [PATCH] [NN] Breaking, batch_size is now first (NCHW default) fix
 https://github.com/mratsim/Arraymancer/issues/132

---
 benchmarks/ex01_xor.nim                       |  35 +++---
 examples/ex01_xor_perceptron_from_scratch.nim | 114 ++++--------------
 src/nn/layers/linear.nim                      |  67 ++++++----
 src/nn/loss/cross_entropy_losses.nim          |   4 +-
 src/nn_primitives/nnp_linear.nim              |  63 ++++++----
 .../nnp_sigmoid_cross_entropy.nim             |  11 +-
 .../nnp_softmax_cross_entropy.nim             |  34 +++---
 tests/nn_primitives/test_nnp_loss.nim         |  16 +--
 8 files changed, 158 insertions(+), 186 deletions(-)

diff --git a/benchmarks/ex01_xor.nim b/benchmarks/ex01_xor.nim
index c6becf772..2f7251b9a 100644
--- a/benchmarks/ex01_xor.nim
+++ b/benchmarks/ex01_xor.nim
@@ -1,21 +1,27 @@
 import ../src/arraymancer
 
+# Learning XOR function with a neural network.
+
+# Autograd context / neuralnet graph
 let ctx = newContext Tensor[float32]
 
-let bsz = 32 #batch size
+let bsz = 32 # batch size
 
-# We will create a tensor of size 3200 --> 100 batch sizes of 32
-# We create it as int between [0, 2[ (2 excluded) and convert to bool
+# We will create a tensor of size 3200 (100 batches of size 32)
+# We create it as int between [0, 2[ and convert to bool
 let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool)
 
-# Let's build or truth labels. We need to apply xor between the 2 columns of the tensors
+# Let's build our truth labels. We need to apply xor between the 2 columns of the tensors
 let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]
 
-# Convert to float and transpose so batch_size is last
-let x_train = ctx.variable(x_train_bool.astype(float32).transpose)
-let y = y_bool.astype(float32).transpose
+# Convert to float
+let x_train = ctx.variable(x_train_bool.astype(float32))
+let y = y_bool.astype(float32)
+
+# We will build the following network:
+# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss
 
-# First hidden layer of 3 neurons, with 2 features in
+# First hidden layer of 3 neurons, shape [3 out_features, 2 in_features]
 # We initialize with random weights between -1 and 1
 let layer_3neurons = ctx.variable(
                       randomTensor(3, 2, 2.0f) .- 1.0f
@@ -32,19 +38,18 @@ let optim = newSGD[float32](
   layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate
 )
 
+# Learning loop
 for epoch in 0..10000:
-
   for batch_id in 0..<100:
 
-    # offset in the Tensor (Remember, batch size is last)
+    # minibatch offset in the Tensor
     let offset = batch_id * 32
-    let x = x_train[_, offset ..< offset + 32]
-    let target = y[_, offset ..< offset + 32]
+    let x = x_train[offset ..< offset + 32, _]
+    let target = y[offset ..< offset + 32, _]
 
     # Building the network
-    let n1 = linear(x, layer_3neurons)
-    let n1_relu = n1.relu
-    let n2 = linear(n1_relu, classifier_layer)
+    let n1 = relu linear(x, layer_3neurons)
+    let n2 = linear(n1, classifier_layer)
     let loss = sigmoid_cross_entropy(n2, target)
 
     # Compute the gradient (i.e. contribution of each parameter to the loss)
diff --git a/examples/ex01_xor_perceptron_from_scratch.nim b/examples/ex01_xor_perceptron_from_scratch.nim
index 85783529c..8f2ded869 100644
--- a/examples/ex01_xor_perceptron_from_scratch.nim
+++ b/examples/ex01_xor_perceptron_from_scratch.nim
@@ -1,82 +1,28 @@
 import ../src/arraymancer
 
-# Example multilayer perceptron in Arraymancer.
-
-# We will use as examples the OR function similar to this article:
-# https://blog.dbrgn.ch/2013/3/26/perceptrons-in-python/
-
-
-# Okay let's start
-# With x and y being one sample, the perceptron equation is
-#
-# Layer 1
-# n1 = relu(a1 * x + b1 * y + c1) # First neuron + relu activation
-# n2 = relu(a2 * x + b2 * y + c2) # 2nd neuron + relu activation
-# n3 = relu(a3 * x + b3 * y + c3) # 3nd neuron + relu activation
-#
-# Layer 2
-# classifier =  a4 * n1 + b4 * n2 + c4 * n3
-#
-# Loss
-# loss = cross_entropy(sigmoid(classifier))
-
-# In terms of high level layers this becomes:
-# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss
-
-# Let's go
+# Learning XOR function with a neural network.
 
-# First create a context that will store backpropagation information
+# Autograd context / neuralnet graph
 let ctx = newContext Tensor[float32]
 
-# We will pass batches of 32 samples
-let bsz = 32 #batch size
-
-# We will create a tensor of size 3200 --> 100 batch sizes of 32
-# We create it as int between [0, 2[ (2 excluded) and convert to bool
-let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool) # generate batch_size examples of (0,1) combination
-
-# Let's check the first 32
-echo x_train_bool[0..<32, _]
-# Tensor of shape 32x2 of type "bool" on backend "Cpu"
-# |true   false|
-# |true   true|
-# |false  false|
-# |false  true|
-# |false  false|
-# |false  false|
-# |false  false|
-# ...
-
-# Let's build or truth labels. We need to apply xor between the 2 columns of the tensors
+let bsz = 32 # batch size
+
+# We will create a tensor of size 3200 (100 batches of size 32)
+# We create it as int between [0, 2[ and convert to bool
+let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool)
+
+# Let's build our truth labels. We need to apply xor between the 2 columns of the tensors
 let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]
 
+# Convert to float
+let x_train = ctx.variable(x_train_bool.astype(float32))
+let y = y_bool.astype(float32)
+
+# We will build the following network:
+# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss
 
-echo y_bool[0..<32, _]
-# Tensor of shape 32x1 of type "bool" on backend "Cpu"
-#         true|
-#         false|
-#         false|
-#         true|
-#         false|
-#         false|
-#         false|
-#         true|
-#         false|
-#         ...
-
-# Convert to float.
-# Important: At the moment, Arraymancer expects batch size to be last
-# so we transpose. In the future Arraymancer will be flexible.
-let x_train = ctx.variable(x_train_bool.astype(float32).transpose)
-let y = y_bool.astype(float32).transpose
-
-# Now we create layer of neurons W that we will train to reproduce the xor function.
-# Weights are of this shape: [W: out_features, in_features]
-
-# First hidden layer of 3 neurons, with 2 features in
+# First hidden layer of 3 neurons, shape [3 out_features, 2 in_features]
 # We initialize with random weights between -1 and 1
-# (We initialize them between 0.0f and 2.0f and then minus 1.0f)
-# .- is the minus broadcasting operator
 let layer_3neurons = ctx.variable(
                       randomTensor(3, 2, 2.0f) .- 1.0f
                       )
@@ -86,40 +32,28 @@ let layer_3neurons = ctx.variable(
 let classifier_layer = ctx.variable(
                   randomTensor(1, 3, 2.0f) .- 1.0f
                   )
-# We use Stochastic Gradient Descent as optimizer
-# With gradient descent the weigth are updated as follows:
-# W -= learning_rate * dW
+
+# Stochastic Gradient Descent
 let optim = newSGD[float32](
   layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate
 )
 
-# Now let's setup the training loops.
-# First loop is passing the mini-batch, bacpropagating, updating the gradients.
-# We do it until the whole x_train tensor has been passed through.
-# This is one "epoch".
-
-# Usually after each epoch we "validate" with a test set that the network was never trained on
-# how the network generalized. In this example we won't go there to keep it short.
-
-# We will do 5 epochs, passing the 32*100 minibatches
+# Learning loop
 for epoch in 0..5:
-
   for batch_id in 0..<100:
 
-    # offset in the Tensor (Remember, batch size is last)
+    # minibatch offset in the Tensor
     let offset = batch_id * 32
-    let x = x_train[_, offset ..< offset + 32]
-    let target = y[_, offset ..< offset + 32]
+    let x = x_train[offset ..< offset + 32, _]
+    let target = y[offset ..< offset + 32, _]
 
     # Building the network
-    let n1 = linear(x, layer_3neurons)
-    let n1_relu = n1.relu
-    let n2 = linear(n1_relu, classifier_layer)
+    let n1 = relu linear(x, layer_3neurons)
+    let n2 = linear(n1, classifier_layer)
     let loss = sigmoid_cross_entropy(n2, target)
 
     echo "Epoch is:" & $epoch
     echo "Batch id:" & $batch_id
-
     echo "Loss is:" & $loss.value.data[0]
 
     # Compute the gradient (i.e. contribution of each parameter to the loss)
diff --git a/src/nn/layers/linear.nim b/src/nn/layers/linear.nim
index ff14aee2a..751095131 100644
--- a/src/nn/layers/linear.nim
+++ b/src/nn/layers/linear.nim
@@ -14,6 +14,7 @@
 
 import  ../../private/ast_utils,
         ../../tensor/tensor,
+        ../../nn_primitives/nn_primitives,
         ../../autograd/autograd,
         ./layer
 
@@ -21,26 +22,44 @@ import  ../../private/ast_utils,
 
 type LinearGate* {.final.} [TT] = ref object of Gate[TT]
   ## TODO: use fused AddMatMul gate: C <- alpha AB + beta C
-  x, W, b: Variable[TT]
+  input, weight, bias: Variable[TT]
 
-method forward*[TT](self: LinearGate[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}=
+method forward*[TT](self: LinearGate[TT], input: Variable[TT]): Variable[TT] {.inline, locks:0.}=
   new result
 
-  result.tape = a.tape
-  result.value = self.W.value * a.value
-  if not self.b.isNil:
-    result.value .+= self.b.value # Bias is broadcasted other the whole batch size
-  result.grad = zeros_like(result.value)
-
-method backward*[TT](self: LinearGate[TT], gradient: TT): SmallDiffs[TT] {.noInit, inline, locks:0.}=
-  result[0] = self.W.value.transpose * gradient # grad w.r.t. x
-  result[1] = gradient * self.x.value.transpose # grad w.r.t. weight
+  if self.bias.isNil:
+    linear(input.value, self.weight.value, result.value)
+  else:
+    linear(input.value, self.weight.value, self.bias.value, result.value)
 
-  if not self.b.isNil:
-    result[2] = sum(gradient, axis=0) # grad w.r.t. bias
-    # https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html
+  result.tape = input.tape
+  result.grad = zeros_like(result.value)
 
-proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT] =
+method backward*[TT](self: LinearGate[TT], gradOutput: TT): SmallDiffs[TT] {.noInit, inline, locks:0.}=
+  # result[0] grad w.r.t. input
+  # result[1] grad w.r.t. weight
+  # result[2] grad w.r.t. bias
+
+  if self.bias.isNil:
+    linear_backward(
+      self.input.value,
+      self.weight.value,
+      gradOutput,
+      result[0],
+      result[1]
+    )
+  else:
+    linear_backward(
+      self.input.value,
+      self.weight.value,
+      self.bias.value,
+      gradOutput,
+      result[0],
+      result[1],
+      result[2]
+    )
+
+proc linear*[TT](input, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT] =
   ## Input:
   ##   - A x Variable of shape [in_features, batch_size]
   ##   - A weight Variable of shape [out_features, in_features]
@@ -57,12 +76,12 @@ proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT
   ##  - Experimental, there is no tests yet for this layer
 
   when compileOption("boundChecks"):
-    if x.value.rank > 2:
+    if input.value.rank > 2:
       raise newException(ValueError, "Tensor must be flattened for a linear layer (features, batch_size)")
 
-    check_ctx(x, weight)
+    check_ctx(input, weight)
     if not bias.isNil:
-      check_ctx(x, bias)
+      check_ctx(input, bias)
 
     # weight has shape: Out_features * In_features
     # bias must have shape: Out_features * 1
@@ -73,23 +92,23 @@ proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT
   var gate: LinearGate[TT]
   new gate
   gate.arity = if bias.isNil: 2 else: 3
-  gate.x = x
-  gate.W = weight
-  gate.b = bias
+  gate.input = input
+  gate.weight = weight
+  gate.bias = bias
 
   # Node
   var node: Node[TT]
   new node
 
   node.gate = gate
-  node.parents[0] = x
+  node.parents[0] = input
   node.parents[1] = weight
   if not bias.isNil:
     node.parents[2] = bias
 
-  x.tape.push(node)
+  input.tape.push(node)
 
   # Resulting var
-  result = gate.forward(x)
+  result = gate.forward(input)
   result.ancestor = node
   node.child = result
\ No newline at end of file
diff --git a/src/nn/loss/cross_entropy_losses.nim b/src/nn/loss/cross_entropy_losses.nim
index 89541f627..57f822465 100644
--- a/src/nn/loss/cross_entropy_losses.nim
+++ b/src/nn/loss/cross_entropy_losses.nim
@@ -27,7 +27,7 @@ template gen_cross_entropy_loss(LossType, forward_proc, backward_proc: untyped)
     # target, from Loss
 
   method forward*[TT](self: LossType[TT], a: Variable[TT], target: TT): Variable[TT] {.inline, locks:0.}=
-    # We expect a in shape [features, batch_size]
+    # We expect a in shape [batch_size, features]
 
     new result
     result.tape = a.tape
@@ -74,7 +74,7 @@ type SparseSoftmaxCrossEntropyLoss* {.final.} [TT] = ref object of SparseLoss[TT
   # target, from Loss
 
 method forward*[TT](self: SparseSoftmaxCrossEntropyLoss[TT], a: Variable[TT], target: Tensor[int]): Variable[TT] {.inline, locks:0.}=
-  # We expect a in shape [features, batch_size]
+  # We expect a in shape [batch_size, features]
 
   new result
   result.tape = a.tape
diff --git a/src/nn_primitives/nnp_linear.nim b/src/nn_primitives/nnp_linear.nim
index f3a94faaf..4b3362834 100644
--- a/src/nn_primitives/nnp_linear.nim
+++ b/src/nn_primitives/nnp_linear.nim
@@ -15,35 +15,50 @@
 import  ../tensor/tensor,
         math
 
-# Sigmoid cross-entropy function that works directly on Tensors
-# and provide control without autograd
-
 # Linear forward and backward
-# TODO: layout version to accept both:
-# - batch_first, NCHW (5D: NTCHW or NDCHW)
-# - batch_last, CHWN (5D: CHWNT or CHWND) tensors.
-proc linear*[T](x: var Tensor[T], weight: Tensor[T], bias: Tensor[T]) {.inline.} =
-  x = weight * x
-  x .+= bias
 
-proc linear*[T](x: var Tensor[T], weight: Tensor[T]) {.inline.} =
-  x = weight * x
+proc linear*[T](input, weight: Tensor[T], bias: Tensor[T], output: var Tensor[T]) {.inline.} =
+  # Linear (Dense) forward primitive with bias
+  #   - input tensor shape [batch_size, in_features]
+  #   - weight tensor shape [out_features, in_features]
+  #   - bias tensor shape [1, out_features], broadcast over batch_size by `.+=`
+  # Output does not need to be initialized to 0 or the proper shape, data will be overwritten
+  # Output is: Y = x * W.transpose + b
+  output = input * weight.transpose
+  output .+= bias
+
+proc linear*[T](input, weight: Tensor[T], output: var Tensor[T]) {.inline.} =
+  # Linear (Dense) forward primitive without bias
+  #   - input tensor shape [batch_size, in_features]
+  #   - weight tensor shape [out_features, in_features]
+  # Output does not need to be initialized to 0 or the proper shape, data will be overwritten
+  # Output is: Y = x * W.transpose
+  output = input * weight.transpose
 
 proc linear_backward*[T](
-        gradient: Tensor[T],
-        cached_tensor,
-        weight, bias: Tensor[T],
-        dW, db: var Tensor[T]): Tensor[T] {.inline.} =
-  result = weight.transpose * gradient
-  gemm(gradient, cached_tensor.transpose, dW)
+        input,
+        weight,
+        bias,
+        gradOutput: Tensor[T],
+        gradInput,
+        gradWeight,
+        gradBias: var Tensor[T]) {.inline.} =
+  # Linear (Dense) backward primitive with bias
+  # Tensors are expected in a batch first shape [batch_size, n_features]
+  # var Tensors do not need to be initialized to 0 or the proper shape, data will be overwritten
+  gradInput = gradOutput * weight
+  gradWeight = gradOutput.transpose * input
 
-  db = sum(gradient, axis=0) # https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html
+  gradBias = sum(gradOutput, axis=0) # https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html
 
 proc linear_backward*[T](
-        gradient: Tensor[T],
-        cached_tensor,
-        weight: Tensor[T],
-        dW: var Tensor[T]): Tensor[T] {.inline.} =
-  result = weight.transpose * gradient
-  gemm(gradient, cached_tensor.transpose, dW)
+        input,
+        weight,
+        gradOutput: Tensor[T],
+        gradInput,
+        gradWeight: var Tensor[T]) {.inline.} =
+  # Linear (Dense) backward primitive without bias
+  # Tensors are expected in a batch first shape [batch_size, n_features]
+  gradInput = gradOutput * weight
+  gradWeight = gradOutput.transpose * input
 
diff --git a/src/nn_primitives/nnp_sigmoid_cross_entropy.nim b/src/nn_primitives/nnp_sigmoid_cross_entropy.nim
index a94f83279..684fa68cf 100644
--- a/src/nn_primitives/nnp_sigmoid_cross_entropy.nim
+++ b/src/nn_primitives/nnp_sigmoid_cross_entropy.nim
@@ -29,7 +29,7 @@ proc sigmoid_cross_entropy*[T](input, target: Tensor[T]): T =
   ## Returns:
   ##   - Apply a sigmoid activation and returns the cross-entropy loss.
   ## Shape:
-  ##   - Both the cache and target shape should be [features, batchsize] i.e. number of samples as last dimension
+  ##   - Both the input and target shape should be [batch_size, features] i.e. number of samples as first dimension
   # TODO: add a `batch_axis` parameter
 
   # TODO: term rewriting macro for auto fusion
@@ -37,7 +37,8 @@ proc sigmoid_cross_entropy*[T](input, target: Tensor[T]): T =
   when compileOption("boundChecks"):
     check_input_target(input, target)
 
-  # input.shape[1] is the batch size
+  let batch_size = input.shape[0]
+
   # ln1p(x) does ln(1 + x) but avoids catastrophic cancellation if x << 1.
 
   # result = 0.T
@@ -50,7 +51,7 @@ proc sigmoid_cross_entropy*[T](input, target: Tensor[T]): T =
       -y * x +  max(x,0) + ln1p(exp(-abs(x))) # This leverage the logsumexp trick to improve numerical stability
 
   # Normalize by batch_size
-  result /= T(input.shape[1])
+  result /= T(batch_size)
 
 proc sigmoid_cross_entropy_backward*[T](
         gradient: Tensor[T] or T,
@@ -63,10 +64,10 @@ proc sigmoid_cross_entropy_backward*[T](
   ##   - A cache tensor that contains data from before the forward pass
   ##   - The target values
   ## Shape:
-  ##   - Both the cache and target shape should be [features, batchsize] i.e. number of samples as last dimension
+  ##   - Both the cache and target shape should be [batch_size, features] i.e. number of samples as first dimension
   # TODO: add a `batch_axis` parameter
 
-  let batch_size = cached_tensor.shape[^1]
+  let batch_size = cached_tensor.shape[0]
 
   # Deal with scalar and tensor gradient
   when gradient is T:
diff --git a/src/nn_primitives/nnp_softmax_cross_entropy.nim b/src/nn_primitives/nnp_softmax_cross_entropy.nim
index c1ec458d3..3fcbfcf34 100644
--- a/src/nn_primitives/nnp_softmax_cross_entropy.nim
+++ b/src/nn_primitives/nnp_softmax_cross_entropy.nim
@@ -25,8 +25,8 @@ proc softmax_cross_entropy*[T](input, target: Tensor[T]): T =
   ## Softmax function + Cross-Entropy loss fused in one layer.
   ##
   ## Input:
-  ##   - A Tensor of shape [predicted_labels_probabilities, batchsize]
-  ##   - The target values of shape [truth_labels_probability, batchsize]
+  ##   - A Tensor of shape [batch_size, predicted_labels_probabilities]
+  ##   - The target values of shape [batch_size, truth_labels_probability]
   ## Returns:
   ##   - Apply a softmax activation and returns the cross-entropy loss.
   ##
@@ -49,11 +49,11 @@ proc softmax_cross_entropy*[T](input, target: Tensor[T]): T =
   when compileOption("boundChecks"):
     check_input_target(input, target)
 
-  let batch_size = input.shape[1]
+  let batch_size = input.shape[0]
   # See at the bottom of the file for explanation/proof
   result = frobenius_inner_prod(input, target)
 
-  let sum_logsumexp = fold_axis_inline(input, T, fold_axis=1) do:
+  let sum_logsumexp = fold_axis_inline(input, T, fold_axis=0) do:
     x = y.logsumexp
   do:
     x += y.logsumexp
@@ -66,7 +66,7 @@ proc sparse_softmax_cross_entropy*[T](input: Tensor[T], target: Tensor[int]): T
   ## Softmax function + Cross-Entropy loss fused in one layer.
   ##
   ## Input:
-  ##   - A Tensor of shape [predicted_labels_probabilities, batchsize]
+  ##   - A Tensor of shape [batchsize, predicted_labels_probabilities]
   ##   - The target values of shape [batchsize] containing the truth label id
   ## Returns:
   ##   - Apply a softmax activation and returns the cross-entropy loss.
@@ -88,7 +88,7 @@ proc sparse_softmax_cross_entropy*[T](input: Tensor[T], target: Tensor[int]): T
 
   # TODO: term rewriting macro for auto fusion
 
-  let batch_size = input.shape[1]
+  let batch_size = input.shape[0]
 
   # TODO proper check
   assert batch_size == target.shape[0]
@@ -96,13 +96,13 @@ proc sparse_softmax_cross_entropy*[T](input: Tensor[T], target: Tensor[int]): T
   # See at the bottom of the file for explanation/proof
   # ∑i(- ti * yi) is either -yi or 0 in the sparse case.
   # Since target holds coordinates: ∑i(- ti * yi) = - yi[ti]
-  for i in 0||(input.shape[1]-1):
-    let lse = input[_,i].logsumexp
+  for i in 0||(input.shape[0]-1):
+    let lse = input[i,_].logsumexp
 
     when not declared(openmp):
-      result += lse - input[target[i], i]
+      result += lse - input[i, target[i]]
     else:
-      let tmp = lse - input[target[i], i]
+      let tmp = lse - input[i, target[i]]
       {.emit:"#pragma omp atomic".}
       {.emit:"`result` += `tmp`;".}
 
@@ -129,10 +129,10 @@ proc softmax_cross_entropy_backward*[T](
   ##   - A cache tensor that contains data from before the forward pass
   ##   - The target values
   ## Shape:
-  ##   - Both the cache and target shape should be [features, batchsize] i.e. number of samples as last dimension
+  ##   - Both the cache and target shape should be [batchsize, features] i.e. number of samples as first dimension
   # TODO: add a `batch_axis` parameter
 
-  let batch_size = cached_tensor.shape[1]
+  let batch_size = cached_tensor.shape[0]
 
   # Deal with scalar and tensor gradient
   when gradient is T:
@@ -140,7 +140,7 @@ proc softmax_cross_entropy_backward*[T](
   elif gradient is Tensor:
     let grad = gradient.data[gradient.offset]
 
-  let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 1).broadcast(cached_tensor.shape)
+  let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 0).broadcast(cached_tensor.shape)
 
   result = map3_inline(cached_tensor, target, axis_max_sumexp):
       grad * (stable_softmax(x, z.max, z.sumexp) - y) / T(batch_size)
@@ -167,18 +167,16 @@ proc sparse_softmax_cross_entropy_backward*[T](
   elif gradient is Tensor:
     let grad = gradient.data[gradient.offset]
 
-  let batch_size = cached_tensor.shape[1]
+  let batch_size = cached_tensor.shape[0]
 
   result = zeros_like(cached_tensor)
   # With sparse target grad * (softmax - y) becomes:
   #   - "grad * (softmax - 1)" for the truth labels
   #   - "grad * softmax for the wrong labels
   for i, truth_idx in enumerate(target):
-    result[truth_idx, i] = -1
-
-  let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 1).broadcast(cached_tensor.shape)
-  # let axis_max_sumexp = cached_tensor.classic_max_sumexp(axis = 1).broadcast(cached_tensor.shape)
+    result[i, truth_idx] = -1
 
+  let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 0).broadcast(cached_tensor.shape)
 
   apply3_inline(result, cached_tensor, axis_max_sumexp):
       grad * (stable_softmax(y, z.max, z.sumexp) + x) / T(batch_size)
diff --git a/tests/nn_primitives/test_nnp_loss.nim b/tests/nn_primitives/test_nnp_loss.nim
index 91532ae17..670b256e7 100644
--- a/tests/nn_primitives/test_nnp_loss.nim
+++ b/tests/nn_primitives/test_nnp_loss.nim
@@ -15,7 +15,7 @@
 import ../../src/arraymancer, unittest
 
 
-suite "Loss functions":
+suite "[NN primitives] Loss functions":
   proc `~=`[T: SomeReal](a, b: T): bool =
     let eps = 2e-5.T
     result = abs(a - b) <= eps
@@ -25,9 +25,9 @@ suite "Loss functions":
     block: # Simple test, no batch
       # https://www.pyimagesearch.com/2016/09/12/softmax-classifiers-explained/
 
-      # Reminder, for now batch_size is the innermost index
-      let predicted = [-3.44, 1.16, -0.81, 3.91].toTensor.reshape(4,1)
-      let truth = [0'f64, 0, 0, 1].toTensor.reshape(4,1)
+      # Creating tensor of shape [batchsize, features]
+      let predicted = [-3.44, 1.16, -0.81, 3.91].toTensor.reshape(1,4)
+      let truth = [0'f64, 0, 0, 1].toTensor.reshape(1,4)
 
       let sce_loss = softmax_cross_entropy(predicted, truth)
       check: sce_loss ~= 0.0709
@@ -64,15 +64,15 @@ suite "Loss functions":
       # Create a sparse label tensor of shape: [batch_size]
       let sparse_labels = randomTensor(batch_size, nb_classes)
 
-      # Create the corresponding dense label tensor of shape [nb_classes, batch_size]
-      var labels = zeros[float64](nb_classes, batch_size)
+      # Create the corresponding dense label tensor of shape [batch_size, nb_classes]
+      var labels = zeros[float64](batch_size, nb_classes)
 
       # Fill in the non-zeros values
       for sample_id, nonzero_idx in enumerate(sparse_labels):
-        labels[nonzero_idx, sample_id] = 1
+        labels[sample_id, nonzero_idx] = 1
 
       # Create a random tensor with predictions:
-      let pred = randomTensor(nb_classes, batch_size, -1.0..1.0)
+      let pred = randomTensor(batch_size, nb_classes, -1.0..1.0)
 
       let sce_loss = softmax_cross_entropy(pred, labels)
       let sparse_sce_loss = sparse_softmax_cross_entropy(pred, sparse_labels)