[MXNET-982] Provide example to illustrate usage of CSVIter in C++ API (…

…#12636) * Adding the example to demonstrate the usage of CSVIter * Addressed the review comments to make the example configurable. Moved the unittests folder in 'examples' directory. * Updated the code to address the cpp lint errors. * Removed the author tag. * Fixing the lint errors and usage message. * Update README file for cpp-package and provide README file for example directory. * Revert "Update README file for cpp-package and provide README file for example directory." This reverts commit 02e784a. These files were part of fix for JIRA issue 1017. These files were mistakenly committed in this PR. * Addressed the review comments regarding usage of atoi and avoiding string copy. * Updated to use strtol instead of atoi
apache · Oct 8, 2018 · 610d79c · 610d79c
1 parent 077253d
commit 610d79c
Show file tree

Hide file tree

Showing 3 changed files with 394 additions and 0 deletions.
diff --git a/cpp-package/example/mlp_csv.cpp b/cpp-package/example/mlp_csv.cpp
@@ -0,0 +1,272 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Example: mlp_csv
+ * Description:
+ * The following example demonstrates how to use CSVIter. It builds an
+ * MLP (multi-layer perceptron) model and trains it on the MNIST data set
+ * stored in CSV format.
+ */
+#include <cctype>
+#include <cerrno>
+#include <chrono>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "utils.h"
+#include "mxnet-cpp/MxNetCpp.h"
+
+using namespace mxnet::cpp;
+
+/*
+ * Builds the symbol of a multi-layer perceptron.
+ *
+ * hidden_units[i] is the number of outputs of the i-th fully connected layer,
+ * so the last entry is the width of the layer fed to SoftmaxOutput. Every
+ * layer except the last one is followed by a ReLU activation. The weight and
+ * bias of layer i are the variables "w<i>" and "b<i>"; the network inputs are
+ * the variables "data" and "label".
+ *
+ * Throws std::invalid_argument when hidden_units is empty, because a network
+ * without layers has no output to attach the softmax to.
+ */
+Symbol mlp(const std::vector<int> &hidden_units) {
+    if (hidden_units.empty()) {
+        throw std::invalid_argument("mlp: at least one layer size is required");
+    }
+
+    auto data = Symbol::Variable("data");
+    auto label = Symbol::Variable("label");
+
+    // Output of the most recently built layer; starts out as the raw input.
+    Symbol current = data;
+    const size_t last_layer = hidden_units.size() - 1;
+    for (size_t i = 0; i < hidden_units.size(); ++i) {
+        const std::string suffix = std::to_string(i);
+        Symbol weight = Symbol::Variable("w" + suffix);
+        Symbol bias = Symbol::Variable("b" + suffix);
+        Symbol fc = FullyConnected(current, weight, bias, hidden_units[i]);
+        // The final layer feeds the softmax directly, so it gets no activation.
+        current = (i == last_layer) ? fc : Activation(fc, ActivationActType::kRelu);
+    }
+    return SoftmaxOutput(current, label);
+}
+
+/*
+ * Converts a whitespace-separated list of layer sizes, e.g. "128 64 10", into
+ * a vector of integers.
+ *
+ * Leading and trailing whitespace is accepted. An empty vector is returned
+ * when the string holds no number at all or when any token is not a strictly
+ * positive base-10 integer that fits in an int (for example "128,64", "abc"
+ * or "0"), so callers can detect malformed input by checking for emptiness.
+ */
+std::vector<int> getLayers(const std::string& hidden_units_string) {
+    std::vector<int> hidden_units;
+    const char* cursor = hidden_units_string.c_str();
+    while (true) {
+        while (std::isspace(static_cast<unsigned char>(*cursor))) {
+            ++cursor;
+        }
+        if (*cursor == '\0') {
+            break;
+        }
+
+        char* token_end = nullptr;
+        errno = 0;
+        const auto num_unit = std::strtol(cursor, &token_end, 10);
+        // strtol leaves token_end == cursor when it consumed nothing; without
+        // this check the loop would never advance on non-numeric input.
+        if (token_end == cursor || errno == ERANGE || num_unit <= 0 || num_unit > INT_MAX) {
+            return std::vector<int>();
+        }
+        hidden_units.push_back(static_cast<int>(num_unit));
+        cursor = token_end;
+    }
+    return hidden_units;
+}
+
+/*
+ * Writes the command-line synopsis, the expected CSV layout and the meaning
+ * of the '--gpu' switch to standard output, one line at a time.
+ */
+void printUsage() {
+    static const char* const kUsageLines[] = {
+        "Usage:",
+        "mlp_csv --train mnist_training_set.csv --test mnist_test_set.csv --epochs 10 "
+        "--batch_size 100 --hidden_units \"128 64 64\" [--gpu]",
+        "The example uses mnist data in CSV format. The MNIST data in CSV format assumes "
+        "the column 0 to be label and the rest 784 column to be data.",
+        "By default, the example uses 'cpu' context. If '--gpu' is specified, "
+        "program uses 'gpu' context."
+    };
+    for (const char* line : kUsageLines) {
+        // std::endl is kept so that every line is flushed as it is written.
+        std::cout << line << std::endl;
+    }
+}
+
+/*
+ * Returns the value that follows the option at argv[*index] and moves *index
+ * onto it. Returns nullptr, leaving *index untouched, when the option is the
+ * last argument on the command line.
+ */
+static const char* nextArgValue(int argc, char** argv, int* index) {
+    if (*index + 1 >= argc) {
+        return nullptr;
+    }
+    ++(*index);
+    return argv[*index];
+}
+
+/*
+ * Parses text as a strictly positive base-10 integer that fits in an int.
+ * Returns true and stores the result in *value on success; returns false and
+ * leaves *value untouched otherwise.
+ */
+static bool parsePositiveInt(const char* text, int* value) {
+    char* end = nullptr;
+    errno = 0;
+    const auto parsed = strtol(text, &end, 10);
+    if (end == text || *end != '\0' || errno == ERANGE || parsed <= 0 || parsed > INT_MAX) {
+        return false;
+    }
+    *value = static_cast<int>(parsed);
+    return true;
+}
+
+/*
+ * Splits one CSVIter batch into labels and features and copies them into
+ * label_arg and data_arg. Returns the extracted label array.
+ *
+ * The batch is reshaped to (num_features + 1, batch_size); row 0 is taken as
+ * the labels and rows 1..num_features as the features.
+ *
+ * NOTE(review): Reshape looks like a reinterpretation of the buffer rather
+ * than a transpose. If the batch is stored record by record, row 0 of the
+ * reshaped array is the start of the first record and not the label column.
+ * Confirm against the CSVIter batch layout.
+ */
+static NDArray copyBatchToArgs(NDArray batch, int batch_size, int num_features,
+                               NDArray* label_arg, NDArray* data_arg) {
+    NDArray reshaped = batch.Reshape(Shape(num_features + 1, batch_size));
+    NDArray labels = reshaped.Slice(0, 1).Reshape(Shape(batch_size));
+    labels.CopyTo(label_arg);
+    reshaped.Slice(1, num_features + 1)
+            .Reshape(Shape(batch_size, num_features))
+            .CopyTo(data_arg);
+    return labels;
+}
+
+/*
+ * Entry point: parses the command line, trains the mlp on the training CSV
+ * and reports throughput and validation accuracy after every epoch.
+ *
+ * Returns 0 on success or after '--help', and 1 when a mandatory argument is
+ * missing or an argument value is malformed.
+ */
+int main(int argc, char** argv) {
+    const int image_size = 28;
+    const int num_mnist_features = image_size * image_size;
+    int batch_size = 100;
+    int max_epoch = 10;
+    const float learning_rate = 0.1;
+    const float weight_decay = 1e-2;
+    bool isGpu = false;
+
+    std::string training_set;
+    std::string test_set;
+    std::string hidden_units_string;
+
+    for (int index = 1; index < argc; ++index) {
+        const char* option = argv[index];
+        if (strcmp("--help", option) == 0) {
+            printUsage();
+            return 0;
+        }
+        if (strcmp("--gpu", option) == 0) {
+            // A plain flag: it must not consume the argument that follows it.
+            isGpu = true;
+            continue;
+        }
+
+        const bool is_train = strcmp("--train", option) == 0;
+        const bool is_test = strcmp("--test", option) == 0;
+        const bool is_epochs = strcmp("--epochs", option) == 0;
+        const bool is_batch_size = strcmp("--batch_size", option) == 0;
+        const bool is_hidden_units = strcmp("--hidden_units", option) == 0;
+        if (!(is_train || is_test || is_epochs || is_batch_size || is_hidden_units)) {
+            // Unrecognised arguments are ignored.
+            continue;
+        }
+
+        // Every remaining option needs a value; never read past argv[argc - 1].
+        const char* value = nextArgValue(argc, argv, &index);
+        if (value == nullptr) {
+            std::cout << "ERROR: Option '" << option << "' requires a value."
+            << std::endl << std::endl;
+            printUsage();
+            return 1;
+        }
+
+        if (is_train) {
+            training_set = value;
+        } else if (is_test) {
+            test_set = value;
+        } else if (is_hidden_units) {
+            hidden_units_string = value;
+        } else {
+            int* target = is_epochs ? &max_epoch : &batch_size;
+            if (!parsePositiveInt(value, target)) {
+                std::cout << "ERROR: Option '" << option << "' expects a positive integer, got '"
+                << value << "'." << std::endl << std::endl;
+                printUsage();
+                return 1;
+            }
+        }
+    }
+
+    if (training_set.empty() || test_set.empty() || hidden_units_string.empty()) {
+        std::cout << "ERROR: The mandatory arguments such as path to training and test data or "
+        << "number of hidden units for mlp are not specified." << std::endl << std::endl;
+        printUsage();
+        return 1;
+    }
+
+    std::vector<int> hidden_units = getLayers(hidden_units_string);
+
+    if (hidden_units.empty()) {
+        std::cout << "ERROR: Number of hidden units are not provided in correct format."
+        << "The numbers need to be separated by ' '." << std::endl << std::endl;
+        printUsage();
+        return 1;
+    }
+
+    /*
+     * The MNIST data in CSV format has 785 columns.
+     * The first column is "Label" and rest of the columns contain data.
+     * The mnist_train.csv has 60000 records and mnist_test.csv has
+     * 10000 records.
+     */
+    auto train_iter = MXDataIter("CSVIter")
+    .SetParam("data_csv", training_set)
+    .SetParam("data_shape", Shape(num_mnist_features + 1, 1))
+    .SetParam("batch_size", batch_size)
+    .SetParam("flat", 1)
+    .SetParam("shuffle", 0)
+    .CreateDataIter();
+
+    auto val_iter = MXDataIter("CSVIter")
+    .SetParam("data_csv", test_set)
+    .SetParam("data_shape", Shape(num_mnist_features + 1, 1))
+    .SetParam("batch_size", batch_size)
+    .SetParam("flat", 1)
+    .SetParam("shuffle", 0)
+    .CreateDataIter();
+
+    auto net = mlp(hidden_units);
+
+    Context ctx = Context::cpu();
+    if (isGpu) {
+        ctx = Context::gpu();
+    }
+
+    std::map<std::string, NDArray> args;
+    args["data"] = NDArray(Shape(batch_size, num_mnist_features), ctx);
+    args["label"] = NDArray(Shape(batch_size), ctx);
+    // Let MXNet infer the shapes of the other parameters such as weights
+    net.InferArgsMap(ctx, &args, args);
+
+    // Initialize all parameters with uniform distribution U(-0.01, 0.01)
+    auto initializer = Uniform(0.01);
+    for (auto& arg : args) {
+        // arg.first is parameter name, and arg.second is the value
+        initializer(arg.first, &arg.second);
+    }
+
+    // Create sgd optimizer
+    Optimizer* opt = OptimizerRegistry::Find("sgd");
+    opt->SetParam("rescale_grad", 1.0/batch_size)
+    ->SetParam("lr", learning_rate)
+    ->SetParam("wd", weight_decay);
+
+    // Create executor by binding parameters to the model. The unique_ptr
+    // releases it even when one of the calls below throws.
+    std::unique_ptr<Executor> exec(net.SimpleBind(ctx, args));
+    auto arg_names = net.ListArguments();
+
+    // Start training
+    for (int iter = 0; iter < max_epoch; ++iter) {
+        int samples = 0;
+        train_iter.Reset();
+
+        auto tic = std::chrono::system_clock::now();
+        while (train_iter.Next()) {
+            samples += batch_size;
+            auto data_batch = train_iter.GetDataBatch();
+
+            // Copy the labels and the features of this batch into the
+            // "label" and "data" args.
+            copyBatchToArgs(data_batch.data, batch_size, num_mnist_features,
+                            &args["label"], &args["data"]);
+
+            exec->Forward(true);
+
+            // Compute gradients
+            exec->Backward();
+            // Update parameters
+            for (size_t i = 0; i < arg_names.size(); ++i) {
+                if (arg_names[i] == "data" || arg_names[i] == "label") continue;
+                opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
+            }
+        }
+        auto toc = std::chrono::system_clock::now();
+
+        Accuracy acc;
+        val_iter.Reset();
+        while (val_iter.Next()) {
+            auto data_batch = val_iter.GetDataBatch();
+
+            // Same split as during training; the labels are kept so that
+            // they can be compared with the predictions.
+            NDArray labelData = copyBatchToArgs(data_batch.data, batch_size,
+                                                num_mnist_features,
+                                                &args["label"], &args["data"]);
+
+            // Forward pass is enough as no gradient is needed when evaluating
+            exec->Forward(false);
+            acc.Update(labelData, exec->outputs[0]);
+        }
+        float duration = std::chrono::duration_cast<std::chrono::milliseconds>
+        (toc - tic).count() / 1000.0;
+        LG << "Epoch[" << iter << "]  " << samples/duration << " samples/sec Accuracy: "
+        << acc.Get();
+    }
+
+    // Release the executor before notifying MXNet of the shutdown.
+    exec.reset();
+    MXNotifyShutdown();
+    return 0;
+}
diff --git a/cpp-package/example/mnist_to_csv.py b/cpp-package/example/mnist_to_csv.py
@@ -0,0 +1,59 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script converts the MNIST data to CSV format.
+# Usage:
+# mnist_to_csv.py train-images-idx3-ubyte train-labels-idx1-ubyte mnist_train.csv 60000
+# mnist_to_csv.py t10k-images-idx3-ubyte t10k-labels-idx1-ubyte mnist_test.csv 10000
+#
+
+import argparse
+
+def convert_to_csv(args):
+    imageFile = open(args.imageFile, "rb")
+    labelFile = open(args.labelFile, "rb")
+    outputFile = open(args.outputFile, "w")
+
+    imageFile.read(16)
+    labelFile.read(8)
+    images = []
+
+    for i in range(args.num_records):
+        image = [ord(labelFile.read(1))]
+        for j in range(28 * 28):
+            image.append(ord(imageFile.read(1)))
+        images.append(image)
+
+    for image in images:
+        outputFile.write(",".join(str(pix) for pix in image) + "\n")
+
+    imageFile.close()
+    outputFile.close()
+    labelFile.close()
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("imageFile", type=str, help="image file in mnist format e.g. train-images-idx3-ubyte")
+    parser.add_argument("labelFile", type=str, help="label file in mnist format e.g train-labels-idx1-ubyte")
+    parser.add_argument("outputFile", type=str, help="Output file in CSV format e.g mnist_train_trial.csv")
+    parser.add_argument("num_records", type=int, help="Number of images in the input files.e.g 60000")
+    args = parser.parse_args()
+
+    try:
+        convert_to_csv(args)
+    except Exception as e:
+        print("Error : Exception {}".format(str(e)))
diff --git a/cpp-package/example/unittests/unit_test_mlp_csv.sh b/cpp-package/example/unittests/unit_test_mlp_csv.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file is a unit test for mlp_csv.cpp example in 'example' directory.
+# The file
+#    1. Downloads the MNIST data,
+#    2. Converts it into CSV format.
+#    3. Runs the mlp_csv example and ensures that the accuracy is more than expected.
+#
+# The script exits with status 0 only when the test passes.
+
+set -e # exit on the first error
+export EXE_NAME=mlp_csv
+
+cd "$(dirname "$(readlink -f "$0")")/../"
+export LD_LIBRARY_PATH="$(readlink -f ../../lib):$LD_LIBRARY_PATH"
+
+if [ ! -f "../../build/cpp-package/example/${EXE_NAME}" ];
+then
+    echo "FAIL: ${EXE_NAME} does not exist"
+    exit 1
+fi
+
+cp "../../build/cpp-package/example/${EXE_NAME}" .
+
+./get_data.sh
+python mnist_to_csv.py ./data/mnist_data/train-images-idx3-ubyte ./data/mnist_data/train-labels-idx1-ubyte ./data/mnist_data/mnist_train.csv 60000
+python mnist_to_csv.py ./data/mnist_data/t10k-images-idx3-ubyte ./data/mnist_data/t10k-labels-idx1-ubyte ./data/mnist_data/mnist_test.csv 10000
+
+# Send both stdout and stderr of the example to the log file.
+"./${EXE_NAME}" --train ./data/mnist_data/mnist_train.csv --test ./data/mnist_data/mnist_test.csv --epochs 10 --batch_size 100 --hidden_units "128 64 10" > "${EXE_NAME}.log" 2>&1
+
+if [ ! -f "${EXE_NAME}.log" ];
+then
+    echo "FAIL: Log file ${EXE_NAME}.log does not exist."
+    exit 1
+fi
+
+# Obtain the accuracy achieved by mlp model after training with MNIST data in CSV format.
+# The accuracy reported for the last epoch is used.
+Acc_obtained=$(grep -oP '.*\K(?<=Accuracy: ).*$' "${EXE_NAME}.log" | tail -1 | tr -d '\n')
+Acc_expected=0.98
+
+# A log without any accuracy line must not be mistaken for a passing run.
+if [ -z "$Acc_obtained" ];
+then
+    echo "FAIL: No accuracy was reported in ${EXE_NAME}.log."
+    exit 1
+fi
+
+# If the obtained accuracy does not meet the expected accuracy, report the test as FAIL.
+if awk -v obtained="$Acc_obtained" -v expected="$Acc_expected" 'BEGIN { exit !(obtained + 0 >= expected + 0) }';
+then
+    echo "PASS: ${EXE_NAME} obtained $Acc_obtained accuracy."
+else
+    echo "FAIL: Accuracy = $Acc_obtained is less than expected accuracy $Acc_expected."
+    exit 1
+fi