From 1024c7af4d488a5580d0b94144cd88bd071ae66d Mon Sep 17 00:00:00 2001 From: mratsim Date: Sat, 16 Dec 2017 20:22:01 +0100 Subject: [PATCH] [NN] Breaking, batch_size is now first (NCHW default) fix https://github.com/mratsim/Arraymancer/issues/132 --- benchmarks/ex01_xor.nim | 35 +++--- examples/ex01_xor_perceptron_from_scratch.nim | 114 ++++-------------- src/nn/layers/linear.nim | 67 ++++++---- src/nn/loss/cross_entropy_losses.nim | 4 +- src/nn_primitives/nnp_linear.nim | 63 ++++++---- .../nnp_sigmoid_cross_entropy.nim | 11 +- .../nnp_softmax_cross_entropy.nim | 34 +++--- tests/nn_primitives/test_nnp_loss.nim | 16 +-- 8 files changed, 158 insertions(+), 186 deletions(-) diff --git a/benchmarks/ex01_xor.nim b/benchmarks/ex01_xor.nim index c6becf772..2f7251b9a 100644 --- a/benchmarks/ex01_xor.nim +++ b/benchmarks/ex01_xor.nim @@ -1,21 +1,27 @@ import ../src/arraymancer +# Learning XOR function with a neural network. + +# Autograd context / neuralnet graph let ctx = newContext Tensor[float32] -let bsz = 32 #batch size +let bsz = 32 # batch size -# We will create a tensor of size 3200 --> 100 batch sizes of 32 -# We create it as int between [0, 2[ (2 excluded) and convert to bool +# We will create a tensor of size 3200 (100 batches of size 32) +# We create it as int between [0, 2[ and convert to bool let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool) -# Let's build or truth labels. We need to apply xor between the 2 columns of the tensors +# Let's build our truth labels. 
We need to apply xor between the 2 columns of the tensors let y_bool = x_train_bool[_,0] xor x_train_bool[_,1] -# Convert to float and transpose so batch_size is last -let x_train = ctx.variable(x_train_bool.astype(float32).transpose) -let y = y_bool.astype(float32).transpose +# Convert to float +let x_train = ctx.variable(x_train_bool.astype(float32)) +let y = y_bool.astype(float32) + +# We will build the following network: +# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss -# First hidden layer of 3 neurons, with 2 features in +# First hidden layer of 3 neurons, shape [3 out_features, 2 in_features] # We initialize with random weights between -1 and 1 let layer_3neurons = ctx.variable( randomTensor(3, 2, 2.0f) .- 1.0f @@ -32,19 +38,18 @@ let optim = newSGD[float32]( layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate ) +# Learning loop for epoch in 0..10000: - for batch_id in 0..<100: - # offset in the Tensor (Remember, batch size is last) + # minibatch offset in the Tensor let offset = batch_id * 32 - let x = x_train[_, offset ..< offset + 32] - let target = y[_, offset ..< offset + 32] + let x = x_train[offset ..< offset + 32, _] + let target = y[offset ..< offset + 32, _] # Building the network - let n1 = linear(x, layer_3neurons) - let n1_relu = n1.relu - let n2 = linear(n1_relu, classifier_layer) + let n1 = relu linear(x, layer_3neurons) + let n2 = linear(n1, classifier_layer) let loss = sigmoid_cross_entropy(n2, target) # Compute the gradient (i.e. contribution of each parameter to the loss) diff --git a/examples/ex01_xor_perceptron_from_scratch.nim b/examples/ex01_xor_perceptron_from_scratch.nim index 85783529c..8f2ded869 100644 --- a/examples/ex01_xor_perceptron_from_scratch.nim +++ b/examples/ex01_xor_perceptron_from_scratch.nim @@ -1,82 +1,28 @@ import ../src/arraymancer -# Example multilayer perceptron in Arraymancer. 
- -# We will use as examples the OR function similar to this article: -# https://blog.dbrgn.ch/2013/3/26/perceptrons-in-python/ - - -# Okay let's start -# With x and y being one sample, the perceptron equation is -# -# Layer 1 -# n1 = relu(a1 * x + b1 * y + c1) # First neuron + relu activation -# n2 = relu(a2 * x + b2 * y + c2) # 2nd neuron + relu activation -# n3 = relu(a3 * x + b3 * y + c3) # 3nd neuron + relu activation -# -# Layer 2 -# classifier = a4 * n1 + b4 * n2 + c4 * n3 -# -# Loss -# loss = cross_entropy(sigmoid(classifier)) - -# In terms of high level layers this becomes: -# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss - -# Let's go +# Learning XOR function with a neural network. -# First create a context that will store backpropagation information +# Autograd context / neuralnet graph let ctx = newContext Tensor[float32] -# We will pass batches of 32 samples -let bsz = 32 #batch size - -# We will create a tensor of size 3200 --> 100 batch sizes of 32 -# We create it as int between [0, 2[ (2 excluded) and convert to bool -let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool) # generate batch_size examples of (0,1) combination - -# Let's check the first 32 -echo x_train_bool[0..<32, _] -# Tensor of shape 32x2 of type "bool" on backend "Cpu" -# |true false| -# |true true| -# |false false| -# |false true| -# |false false| -# |false false| -# |false false| -# ... - -# Let's build or truth labels. We need to apply xor between the 2 columns of the tensors +let bsz = 32 # batch size + +# We will create a tensor of size 3200 (100 batches of size 32) +# We create it as int between [0, 2[ and convert to bool +let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool) + +# Let's build our truth labels. 
We need to apply xor between the 2 columns of the tensors let y_bool = x_train_bool[_,0] xor x_train_bool[_,1] +# Convert to float +let x_train = ctx.variable(x_train_bool.astype(float32)) +let y = y_bool.astype(float32) + +# We will build the following network: +# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss -echo y_bool[0..<32, _] -# Tensor of shape 32x1 of type "bool" on backend "Cpu" -# true| -# false| -# false| -# true| -# false| -# false| -# false| -# true| -# false| -# ... - -# Convert to float. -# Important: At the moment, Arraymancer expects batch size to be last -# so we transpose. In the future Arraymancer will be flexible. -let x_train = ctx.variable(x_train_bool.astype(float32).transpose) -let y = y_bool.astype(float32).transpose - -# Now we create layer of neurons W that we will train to reproduce the xor function. -# Weights are of this shape: [W: out_features, in_features] - -# First hidden layer of 3 neurons, with 2 features in +# First hidden layer of 3 neurons, shape [3 out_features, 2 in_features] # We initialize with random weights between -1 and 1 -# (We initialize them between 0.0f and 2.0f and then minus 1.0f) -# .- is the minus broadcasting operator let layer_3neurons = ctx.variable( randomTensor(3, 2, 2.0f) .- 1.0f ) @@ -86,40 +32,28 @@ let layer_3neurons = ctx.variable( let classifier_layer = ctx.variable( randomTensor(1, 3, 2.0f) .- 1.0f ) -# We use Stochastic Gradient Descent as optimizer -# With gradient descent the weigth are updated as follows: -# W -= learning_rate * dW + +# Stochastic Gradient Descent let optim = newSGD[float32]( layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate ) -# Now let's setup the training loops. -# First loop is passing the mini-batch, bacpropagating, updating the gradients. -# We do it until the whole x_train tensor has been passed through. -# This is one "epoch". 
- -# Usually after each epoch we "validate" with a test set that the network was never trained on -# how the network generalized. In this example we won't go there to keep it short. - -# We will do 5 epochs, passing the 32*100 minibatches +# Learning loop for epoch in 0..5: - for batch_id in 0..<100: - # offset in the Tensor (Remember, batch size is last) + # minibatch offset in the Tensor let offset = batch_id * 32 - let x = x_train[_, offset ..< offset + 32] - let target = y[_, offset ..< offset + 32] + let x = x_train[offset ..< offset + 32, _] + let target = y[offset ..< offset + 32, _] # Building the network - let n1 = linear(x, layer_3neurons) - let n1_relu = n1.relu - let n2 = linear(n1_relu, classifier_layer) + let n1 = relu linear(x, layer_3neurons) + let n2 = linear(n1, classifier_layer) let loss = sigmoid_cross_entropy(n2, target) echo "Epoch is:" & $epoch echo "Batch id:" & $batch_id - echo "Loss is:" & $loss.value.data[0] # Compute the gradient (i.e. contribution of each parameter to the loss) diff --git a/src/nn/layers/linear.nim b/src/nn/layers/linear.nim index ff14aee2a..751095131 100644 --- a/src/nn/layers/linear.nim +++ b/src/nn/layers/linear.nim @@ -14,6 +14,7 @@ import ../../private/ast_utils, ../../tensor/tensor, + ../../nn_primitives/nn_primitives, ../../autograd/autograd, ./layer @@ -21,26 +22,44 @@ import ../../private/ast_utils, type LinearGate* {.final.} [TT] = ref object of Gate[TT] ## TODO: use fused AddMatMul gate: C <- alpha AB + beta C - x, W, b: Variable[TT] + input, weight, bias: Variable[TT] -method forward*[TT](self: LinearGate[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}= +method forward*[TT](self: LinearGate[TT], input: Variable[TT]): Variable[TT] {.inline, locks:0.}= new result - result.tape = a.tape - result.value = self.W.value * a.value - if not self.b.isNil: - result.value .+= self.b.value # Bias is broadcasted other the whole batch size - result.grad = zeros_like(result.value) - -method backward*[TT](self: 
LinearGate[TT], gradient: TT): SmallDiffs[TT] {.noInit, inline, locks:0.}= - result[0] = self.W.value.transpose * gradient # grad w.r.t. x - result[1] = gradient * self.x.value.transpose # grad w.r.t. weight + if self.bias.isNil: + linear(input.value, self.weight.value, result.value) + else: + linear(input.value, self.weight.value, self.bias.value, result.value) - if not self.b.isNil: - result[2] = sum(gradient, axis=0) # grad w.r.t. bias - # https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html + result.tape = input.tape + result.grad = zeros_like(result.value) -proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT] = +method backward*[TT](self: LinearGate[TT], gradOutput: TT): SmallDiffs[TT] {.noInit, inline, locks:0.}= + # result[0] grad w.r.t. input + # result[1] grad w.r.t. weight + # result[2] grad w.r.t. bias + + if self.bias.isNil: + linear_backward( + self.input.value, + self.weight.value, + gradOutput, + result[0], + result[1] + ) + else: + linear_backward( + self.input.value, + self.weight.value, + self.bias.value, + gradOutput, + result[0], + result[1], + result[2] + ) + +proc linear*[TT](input, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT] = ## Input: ## - A x Variable of shape [in_features, batch_size] ## - A weight Variable of shape [out_features, in_features] @@ -57,12 +76,12 @@ proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT ## - Experimental, there is no tests yet for this layer when compileOption("boundChecks"): - if x.value.rank > 2: + if input.value.rank > 2: raise newException(ValueError, "Tensor must be flattened for a linear layer (features, batch_size)") - check_ctx(x, weight) + check_ctx(input, weight) if not bias.isNil: - check_ctx(x, bias) + check_ctx(input, bias) # weight has shape: Out_features * In_features # bias must have shape: Out_features * 1 @@ -73,23 +92,23 @@ proc linear*[TT](x, weight: Variable[TT], 
bias: Variable[TT] = nil): Variable[TT var gate: LinearGate[TT] new gate gate.arity = if bias.isNil: 2 else: 3 - gate.x = x - gate.W = weight - gate.b = bias + gate.input = input + gate.weight = weight + gate.bias = bias # Node var node: Node[TT] new node node.gate = gate - node.parents[0] = x + node.parents[0] = input node.parents[1] = weight if not bias.isNil: node.parents[2] = bias - x.tape.push(node) + input.tape.push(node) # Resulting var - result = gate.forward(x) + result = gate.forward(input) result.ancestor = node node.child = result \ No newline at end of file diff --git a/src/nn/loss/cross_entropy_losses.nim b/src/nn/loss/cross_entropy_losses.nim index 89541f627..57f822465 100644 --- a/src/nn/loss/cross_entropy_losses.nim +++ b/src/nn/loss/cross_entropy_losses.nim @@ -27,7 +27,7 @@ template gen_cross_entropy_loss(LossType, forward_proc, backward_proc: untyped) # target, from Loss method forward*[TT](self: LossType[TT], a: Variable[TT], target: TT): Variable[TT] {.inline, locks:0.}= - # We expect a in shape [features, batch_size] + # We expect a in shape [batch_size, features] new result result.tape = a.tape @@ -74,7 +74,7 @@ type SparseSoftmaxCrossEntropyLoss* {.final.} [TT] = ref object of SparseLoss[TT # target, from Loss method forward*[TT](self: SparseSoftmaxCrossEntropyLoss[TT], a: Variable[TT], target: Tensor[int]): Variable[TT] {.inline, locks:0.}= - # We expect a in shape [features, batch_size] + # We expect a in shape [batch_size, features] new result result.tape = a.tape diff --git a/src/nn_primitives/nnp_linear.nim b/src/nn_primitives/nnp_linear.nim index f3a94faaf..4b3362834 100644 --- a/src/nn_primitives/nnp_linear.nim +++ b/src/nn_primitives/nnp_linear.nim @@ -15,35 +15,50 @@ import ../tensor/tensor, math -# Sigmoid cross-entropy function that works directly on Tensors -# and provide control without autograd - # Linear forward and backward -# TODO: layout version to accept both: -# - batch_first, NCHW (5D: NTCHW or NDCHW) -# - batch_last, 
CHWN (5D: CHWNT or CHWND) tensors. -proc linear*[T](x: var Tensor[T], weight: Tensor[T], bias: Tensor[T]) {.inline.} = - x = weight * x - x .+= bias -proc linear*[T](x: var Tensor[T], weight: Tensor[T]) {.inline.} = - x = weight * x +proc linear*[T](input, weight: Tensor[T], bias: Tensor[T], output: var Tensor[T]) {.inline.} = + # Linear (Dense) forward primitive with bias + # - input tensor shape [batch_size, in_features] + # - weight tensor shape [out_features, in_features] + # - bias tensor shape [batch_size, out_features] + # Output does not need to be initialized to 0 or the proper shape, data will be overwritten + # Output is: Y = x * W.transpose + b + output = input * weight.transpose + output .+= bias + +proc linear*[T](input, weight: Tensor[T], output: var Tensor[T]) {.inline.} = + # Linear (Dense) forward primitive without bias + # - input tensor shape [batch_size, in_features] + # - weight tensor shape [out_features, in_features] + # Output does not need to be initialized to 0 or the proper shape, data will be overwritten + # Output is: Y = x * W.transpose + output = input * weight.transpose proc linear_backward*[T]( - gradient: Tensor[T], - cached_tensor, - weight, bias: Tensor[T], - dW, db: var Tensor[T]): Tensor[T] {.inline.} = - result = weight.transpose * gradient - gemm(gradient, cached_tensor.transpose, dW) + input, + weight, + bias, + gradOutput: Tensor[T], + gradInput, + gradWeight, + gradBias: var Tensor[T]) {.inline.} = + # Linear (Dense) backward primitive with bias + # Tensors are expected in a batch first shape [batch_size, n_features] + # var Tensors do not need to be initialized to 0 or the proper shape, data will be overwritten + gradInput = gradOutput * weight + gradWeight = gradOutput.transpose * input - db = sum(gradient, axis=0) # https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html + gradBias = sum(gradOutput, axis=0) # 
https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html proc linear_backward*[T]( - gradient: Tensor[T], - cached_tensor, - weight: Tensor[T], - dW: var Tensor[T]): Tensor[T] {.inline.} = - result = weight.transpose * gradient - gemm(gradient, cached_tensor.transpose, dW) + input, + weight, + gradOutput: Tensor[T], + gradInput, + gradWeight: var Tensor[T]) {.inline.} = + # Linear (Dense) backward primitive without bias + # Tensors are expected in a batch first shape [batch_size, n_features] + gradInput = gradOutput * weight + gradWeight = gradOutput.transpose * input diff --git a/src/nn_primitives/nnp_sigmoid_cross_entropy.nim b/src/nn_primitives/nnp_sigmoid_cross_entropy.nim index a94f83279..684fa68cf 100644 --- a/src/nn_primitives/nnp_sigmoid_cross_entropy.nim +++ b/src/nn_primitives/nnp_sigmoid_cross_entropy.nim @@ -29,7 +29,7 @@ proc sigmoid_cross_entropy*[T](input, target: Tensor[T]): T = ## Returns: ## - Apply a sigmoid activation and returns the cross-entropy loss. ## Shape: - ## - Both the cache and target shape should be [features, batchsize] i.e. number of samples as last dimension + ## - Both the cache and target shape should be [batch_size, features] i.e. number of samples as first dimension # TODO: add a `batch_axis` parameter # TODO: term rewriting macro for auto fusion @@ -37,7 +37,8 @@ proc sigmoid_cross_entropy*[T](input, target: Tensor[T]): T = when compileOption("boundChecks"): check_input_target(input, target) - # input.shape[1] is the batch size + let batch_size = input.shape[0] + # ln1p(x) does ln(1 + x) but avoids catastrophic cancellation if x << 1. 
# result = 0.T @@ -50,7 +51,7 @@ proc sigmoid_cross_entropy*[T](input, target: Tensor[T]): T = -y * x + max(x,0) + ln1p(exp(-abs(x))) # This leverage the logsumexp trick to improve numerical stability # Normalize by batch_size - result /= T(input.shape[1]) + result /= T(batch_size) proc sigmoid_cross_entropy_backward*[T]( gradient: Tensor[T] or T, @@ -63,10 +64,10 @@ proc sigmoid_cross_entropy_backward*[T]( ## - A cache tensor that contains data from before the forward pass ## - The target values ## Shape: - ## - Both the cache and target shape should be [features, batchsize] i.e. number of samples as last dimension + ## - Both the cache and target shape should be [batch_size, features] i.e. number of samples as first dimension # TODO: add a `batch_axis` parameter - let batch_size = cached_tensor.shape[^1] + let batch_size = cached_tensor.shape[0] # Deal with scalar and tensor gradient when gradient is T: diff --git a/src/nn_primitives/nnp_softmax_cross_entropy.nim b/src/nn_primitives/nnp_softmax_cross_entropy.nim index c1ec458d3..3fcbfcf34 100644 --- a/src/nn_primitives/nnp_softmax_cross_entropy.nim +++ b/src/nn_primitives/nnp_softmax_cross_entropy.nim @@ -25,8 +25,8 @@ proc softmax_cross_entropy*[T](input, target: Tensor[T]): T = ## Softmax function + Cross-Entropy loss fused in one layer. ## ## Input: - ## - A Tensor of shape [predicted_labels_probabilities, batchsize] - ## - The target values of shape [truth_labels_probability, batchsize] + ## - A Tensor of shape [batch_size, predicted_labels_probabilities] + ## - The target values of shape [batchsize, truth_labels_probability] ## Returns: ## - Apply a softmax activation and returns the cross-entropy loss. 
## @@ -49,11 +49,11 @@ proc softmax_cross_entropy*[T](input, target: Tensor[T]): T = when compileOption("boundChecks"): check_input_target(input, target) - let batch_size = input.shape[1] + let batch_size = input.shape[0] # See at the bottom of the file for explanation/proof result = frobenius_inner_prod(input, target) - let sum_logsumexp = fold_axis_inline(input, T, fold_axis=1) do: + let sum_logsumexp = fold_axis_inline(input, T, fold_axis=0) do: x = y.logsumexp do: x += y.logsumexp @@ -66,7 +66,7 @@ proc sparse_softmax_cross_entropy*[T](input: Tensor[T], target: Tensor[int]): T ## Softmax function + Cross-Entropy loss fused in one layer. ## ## Input: - ## - A Tensor of shape [predicted_labels_probabilities, batchsize] + ## - A Tensor of shape [batchsize, predicted_labels_probabilities] ## - The target values of shape [batchsize] containing the truth label id ## Returns: ## - Apply a softmax activation and returns the cross-entropy loss. @@ -88,7 +88,7 @@ proc sparse_softmax_cross_entropy*[T](input: Tensor[T], target: Tensor[int]): T # TODO: term rewriting macro for auto fusion - let batch_size = input.shape[1] + let batch_size = input.shape[0] # TODO proper check assert batch_size == target.shape[0] @@ -96,13 +96,13 @@ proc sparse_softmax_cross_entropy*[T](input: Tensor[T], target: Tensor[int]): T # See at the bottom of the file for explanation/proof # ∑i(- ti * yi) is either -yi or 0 in the sparse case. 
# Since target holds coordinates: ∑i(- ti * yi) = - yi[ti] - for i in 0||(input.shape[1]-1): - let lse = input[_,i].logsumexp + for i in 0||(input.shape[0]-1): + let lse = input[i,_].logsumexp when not declared(openmp): - result += lse - input[target[i], i] + result += lse - input[i, target[i]] else: - let tmp = lse - input[target[i], i] + let tmp = lse - input[i, target[i]] {.emit:"#pragma omp atomic".} {.emit:"`result` += `tmp`;".} @@ -129,10 +129,10 @@ proc softmax_cross_entropy_backward*[T]( ## - A cache tensor that contains data from before the forward pass ## - The target values ## Shape: - ## - Both the cache and target shape should be [features, batchsize] i.e. number of samples as last dimension + ## - Both the cache and target shape should be [batchsize, features] i.e. number of samples as first dimension # TODO: add a `batch_axis` parameter - let batch_size = cached_tensor.shape[1] + let batch_size = cached_tensor.shape[0] # Deal with scalar and tensor gradient when gradient is T: @@ -140,7 +140,7 @@ proc softmax_cross_entropy_backward*[T]( elif gradient is Tensor: let grad = gradient.data[gradient.offset] - let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 1).broadcast(cached_tensor.shape) + let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 0).broadcast(cached_tensor.shape) result = map3_inline(cached_tensor, target, axis_max_sumexp): grad * (stable_softmax(x, z.max, z.sumexp) - y) / T(batch_size) @@ -167,18 +167,16 @@ proc sparse_softmax_cross_entropy_backward*[T]( elif gradient is Tensor: let grad = gradient.data[gradient.offset] - let batch_size = cached_tensor.shape[1] + let batch_size = cached_tensor.shape[0] result = zeros_like(cached_tensor) # With sparse target grad * (softmax - y) becomes: # - "grad * (softmax - 1)" for the truth labels # - "grad * softmax for the wrong labels for i, truth_idx in enumerate(target): - result[truth_idx, i] = -1 - - let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 
1).broadcast(cached_tensor.shape) - # let axis_max_sumexp = cached_tensor.classic_max_sumexp(axis = 1).broadcast(cached_tensor.shape) + result[i, truth_idx] = -1 + let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 0).broadcast(cached_tensor.shape) apply3_inline(result, cached_tensor, axis_max_sumexp): grad * (stable_softmax(y, z.max, z.sumexp) + x) / T(batch_size) diff --git a/tests/nn_primitives/test_nnp_loss.nim b/tests/nn_primitives/test_nnp_loss.nim index 91532ae17..670b256e7 100644 --- a/tests/nn_primitives/test_nnp_loss.nim +++ b/tests/nn_primitives/test_nnp_loss.nim @@ -15,7 +15,7 @@ import ../../src/arraymancer, unittest -suite "Loss functions": +suite "[NN primitives] Loss functions": proc `~=`[T: SomeReal](a, b: T): bool = let eps = 2e-5.T result = abs(a - b) <= eps @@ -25,9 +25,9 @@ suite "Loss functions": block: # Simple test, no batch # https://www.pyimagesearch.com/2016/09/12/softmax-classifiers-explained/ - # Reminder, for now batch_size is the innermost index - let predicted = [-3.44, 1.16, -0.81, 3.91].toTensor.reshape(4,1) - let truth = [0'f64, 0, 0, 1].toTensor.reshape(4,1) + # Creating tensor of shape [batchsize, features] + let predicted = [-3.44, 1.16, -0.81, 3.91].toTensor.reshape(1,4) + let truth = [0'f64, 0, 0, 1].toTensor.reshape(1,4) let sce_loss = softmax_cross_entropy(predicted, truth) check: sce_loss ~= 0.0709 @@ -64,15 +64,15 @@ suite "Loss functions": # Create a sparse label tensor of shape: [batch_size] let sparse_labels = randomTensor(batch_size, nb_classes) - # Create the corresponding dense label tensor of shape [nb_classes, batch_size] - var labels = zeros[float64](nb_classes, batch_size) + # Create the corresponding dense label tensor of shape [batch_size, nb_classes] + var labels = zeros[float64](batch_size, nb_classes) # Fill in the non-zeros values for sample_id, nonzero_idx in enumerate(sparse_labels): - labels[nonzero_idx, sample_id] = 1 + labels[sample_id, nonzero_idx] = 1 # Create a random tensor with 
predictions: - let pred = randomTensor(nb_classes, batch_size, -1.0..1.0) + let pred = randomTensor(batch_size, nb_classes, -1.0..1.0) let sce_loss = softmax_cross_entropy(pred, labels) let sparse_sce_loss = sparse_softmax_cross_entropy(pred, sparse_labels)