Skip to content

Commit

Permalink
added template for data reader to pass conduit node from driver
Browse files Browse the repository at this point in the history
added conduit to cmakelist

fixed error with global_trainer_

added simple conduit datareader to hold conduit node

prototyping use of data_store to hold conduit nodes

fixing bug with input buffers not being sized correctly

fixed problem with unpacking conduit node

moving {trainer, dc, dr, ds setup} and {loading inference samples} to separate functions

extended core API for many different input types

removed old code from first lbann-core impl

added simple run script

Fix things that have drifted in LBANN

Get core-drive compiling again

clang-format batch_functional_inference_algorithm

Steps toward debugging the segfault in the inference algo test

The test no longer segfaults. Now it just fails.

Don't shuffle when setting up for inference

Fix a spacing issue

Updated CMake to install the core driver

Build the core-driver
  • Loading branch information
mrwyattii authored and bvanessen committed Sep 24, 2024
1 parent cae84af commit 008e912
Show file tree
Hide file tree
Showing 19 changed files with 607 additions and 218 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -920,6 +920,7 @@ add_subdirectory(applications/CANDLE/pilot2/tools)
add_subdirectory(applications/ATOM/utils)
add_subdirectory(tests)
add_subdirectory(scripts)
add_subdirectory(core-driver)

################################################################
# Install LBANN
Expand Down
2 changes: 1 addition & 1 deletion cmake/configure_files/LBANNConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ set(LBANN_HAS_DIHYDROGEN @LBANN_HAS_DIHYDROGEN@)
set(LBANN_HAS_DISTCONV @LBANN_HAS_DISTCONV@)
set(LBANN_HAS_DOXYGEN @LBANN_HAS_DOXYGEN@)
set(LBANN_HAS_EMBEDDED_PYTHON @LBANN_HAS_EMBEDDED_PYTHON@)
set(LBANN_HAS_FFTW @LBANN_HAS_FFTW@
set(LBANN_HAS_FFTW @LBANN_HAS_FFTW@)
set(LBANN_HAS_FFTW_FLOAT @LBANN_HAS_FFTW_FLOAT@)
set(LBANN_HAS_FFTW_DOUBLE @LBANN_HAS_FFTW_DOUBLE@)
set(LBANN_HAS_GPU_FP16 @LBANN_HAS_GPU_FP16@)
Expand Down
21 changes: 17 additions & 4 deletions core-driver/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
cmake_minimum_required(VERSION 3.18.0)
project(my_lbann_test C CXX)
cmake_minimum_required(VERSION 3.21.0)
project(my_lbann_test CXX)
find_package(LBANN 0.102.0 REQUIRED)
add_executable(Main main.cpp)
target_link_libraries(Main PRIVATE LBANN::lbann)
find_package(Conduit CONFIG REQUIRED)
add_executable(lbann-core main.cpp)
target_link_libraries(lbann-core PRIVATE LBANN::lbann)

#target_link_libraries(lbann-bin lbann)
set_target_properties(lbann-core
PROPERTIES
OUTPUT_NAME lbann-core-driver
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

#list(APPEND LBANN_EXE_TGTS lbann-core)

install(TARGETS lbann-core
EXPORT LBANNTargets
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
107 changes: 93 additions & 14 deletions core-driver/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,11 @@
#include <mpi.h>
#include <stdio.h>

// Add test-specific options
void construct_opts(int argc, char **argv) {
auto& arg_parser = lbann::global_argument_parser();
lbann::construct_std_options();
lbann::construct_datastore_options();
arg_parser.add_option("samples",
{"-n"},
"Number of samples to run inference on",
Expand All @@ -52,20 +55,76 @@ void construct_opts(int argc, char **argv) {
"Number of labels in dataset",
10);
arg_parser.add_option("minibatchsize",
{"-mbs"},
{"--mbs"},
"Number of samples in a mini-batch",
16);
arg_parser.add_flag("use_conduit",
{"--conduit"},
"Use Conduit node samples (Default is non-distributed matrix)");
arg_parser.add_flag("use_dist_matrix",
{"--dist"},
"Use Hydrogen distributed matrix (Default is non-distributed matrix)");
arg_parser.add_required_argument<std::string>
("model",
"Directory containing checkpointed model");
arg_parser.parse(argc, argv);
}

El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>
random_samples(El::Grid const& g, int n, int c, int h, int w) {
// Generates random samples and labels for mnist data in Hydrogen matrix
std::map<
std::string,
El::Matrix<float, El::Device::CPU>>
mat_mnist_samples(int n, int c, int h, int w)
{
El::Matrix<float, El::Device::CPU>
samples(c * h * w, n);
El::MakeUniform(samples);
El::Matrix<float, El::Device::CPU>
labels(1, n);
El::MakeUniform(labels);
std::map<
std::string,
El::Matrix<float, El::Device::CPU>>
samples_map = {{"data/samples", samples}, {"data/labels", labels}};
return samples_map;
}

// Builds random MNIST-style inference inputs as Hydrogen distributed matrices
// on grid `g`.
//
// Returns a map with two entries:
//   "data/samples" -> (c * h * w) x n matrix filled by El::MakeUniform
//   "data/labels"  -> 1 x n matrix filled by El::MakeUniform
//
// The sample matrix uses the same (features x samples) layout as
// mat_mnist_samples; the earlier (n, c * h * w) construction is dropped.
std::map<
  std::string,
  El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>>
distmat_mnist_samples(El::Grid const& g, int n, int c, int h, int w)
{
  El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>
    samples(c * h * w, n, g);
  El::MakeUniform(samples);
  El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>
    labels(1, n, g);
  El::MakeUniform(labels);
  std::map<
    std::string,
    El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>>
    samples_map = {{"data/samples", samples}, {"data/labels", labels}};
  return samples_map;
}

// Fills the first `size` elements of `arr` with pseudo-random values.
//
// Each element is set to (std::rand() % max_val) / max_val, so for a positive
// `max_val` the values lie in [0, 1). `max_val` must be non-zero (it is used
// as a modulus and a divisor). Nothing is written when `size` <= 0.
void random_fill(float *arr, int size, int max_val=255) {
  // The index must start at 0; leaving it uninitialized is undefined behavior.
  for (int i = 0; i < size; ++i) {
    arr[i] = static_cast<float>(std::rand() % max_val) /
             static_cast<float>(max_val);
  }
}

// Generates `n` random MNIST-style samples as a vector of Conduit nodes.
//
// Each node receives:
//   "data/samples" -> c * h * w floats produced by random_fill
//   "data/labels"  -> std::rand() % 10
std::vector<conduit::Node> conduit_mnist_samples(int n, int c, int h, int w) {
  std::vector<conduit::Node> samples(n);
  const int sample_size = c * h * w;
  // Heap-backed scratch buffer; a variable-length array is not standard C++.
  // The buffer is reused for every sample, exactly as before.
  // NOTE(review): reuse relies on Node::set copying the data - confirm
  // against the Conduit documentation.
  std::vector<float> this_sample(sample_size);
  // The index must start at 0; leaving it uninitialized is undefined behavior.
  for (int i = 0; i < n; ++i) {
    random_fill(this_sample.data(), sample_size);
    samples[i]["data/samples"].set(this_sample.data(), sample_size);
    samples[i]["data/labels"] = std::rand() % 10;
  }
  return samples;
}

Expand All @@ -79,10 +138,13 @@ int main(int argc, char **argv) {
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);

// Get input arguments and print values
// Get input arguments, check and print values
construct_opts(argc, argv);
auto& arg_parser = lbann::global_argument_parser();
if (rank == 0) {
if (arg_parser.get<bool>("use_conduit") && arg_parser.get<bool>("use_dist_matrix")) {
LBANN_ERROR("Cannot use conduit node and distributed matrix together, choose one: --conduit --dist");
}
std::stringstream msg;
msg << "Model: " << arg_parser.get<std::string>("model") << std::endl;
msg << "{ N, c, h, w } = { " << arg_parser.get<int>("samples") << ", ";
Expand All @@ -94,8 +156,8 @@ int main(int argc, char **argv) {
std::cout << msg.str();
}

// Load model and run inference on samples
auto lbann_comm = lbann::initialize_lbann(MPI_COMM_WORLD);

auto m = lbann::load_inference_model(lbann_comm.get(),
arg_parser.get<std::string>("model"),
arg_parser.get<int>("minibatchsize"),
Expand All @@ -105,14 +167,31 @@ int main(int argc, char **argv) {
arg_parser.get<int>("width")
},
{arg_parser.get<int>("labels")});
auto samples = random_samples(lbann_comm->get_trainer_grid(),
arg_parser.get<int>("samples"),
arg_parser.get<int>("channels"),
arg_parser.get<int>("height"),
arg_parser.get<int>("width"));
auto labels = lbann::infer(m.get(),
samples,
arg_parser.get<int>("minibatchsize"));

// three options for data generation
if (arg_parser.get<bool>("use_conduit")) {
auto samples = conduit_mnist_samples(arg_parser.get<int>("samples"),
arg_parser.get<int>("channels"),
arg_parser.get<int>("height"),
arg_parser.get<int>("width"));
lbann::set_inference_samples(samples);
} else if (arg_parser.get<bool>("use_dist_matrix")) {
auto samples = distmat_mnist_samples(lbann_comm->get_trainer_grid(),
arg_parser.get<int>("samples"),
arg_parser.get<int>("channels"),
arg_parser.get<int>("height"),
arg_parser.get<int>("width"));
lbann::set_inference_samples(samples);
} else {
auto samples = mat_mnist_samples(
arg_parser.get<int>("samples"),
arg_parser.get<int>("channels"),
arg_parser.get<int>("height"),
arg_parser.get<int>("width"));
lbann::set_inference_samples(samples);
}

auto labels = lbann::inference(m.get());

// Print inference results
if (lbann_comm->am_world_master()) {
Expand Down
10 changes: 10 additions & 0 deletions core-driver/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/sh
# Runs the LBANN core driver once per input mode: the default
# (non-distributed matrix), --dist (distributed matrix) and --conduit
# (Conduit nodes).
export AL_PROGRESS_RANKS_PER_NUMA_NODE=2
export OMP_NUM_THREADS=8
export MV2_USE_RDMA_CM=0

# This should be a checkpointed lenet model
MODEL_LOC="path/to/checkpointed/model"

# core-driver/CMakeLists.txt names the executable "lbann-core-driver"; the old
# "Main" target no longer exists. Set LBANN_CORE_DRIVER to point at the binary
# if it is not in the current directory (e.g. <build>/bin/lbann-core-driver).
DRIVER="${LBANN_CORE_DRIVER:-./lbann-core-driver}"

"$DRIVER" "$MODEL_LOC"
"$DRIVER" "$MODEL_LOC" --dist
"$DRIVER" "$MODEL_LOC" --conduit
1 change: 1 addition & 0 deletions include/lbann/data_ingestion/readers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ set_full_path(THIS_DIR_HEADERS
metadata.hpp
# Data readers
data_reader_cifar10.hpp
data_reader_conduit.hpp
data_reader_csv.hpp
data_reader_image.hpp
data_reader_HDF5.hpp
Expand Down
72 changes: 72 additions & 0 deletions include/lbann/data_ingestion/readers/data_reader_conduit.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2014-2021, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory.
// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
//
// LLNL-CODE-697807.
// All rights reserved.
//
// This file is part of LBANN: Livermore Big Artificial Neural Network
// Toolkit. For details, see http://software.llnl.gov/LBANN or
// https://github.com/LLNL/LBANN.
//
// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
// may not use this file except in compliance with the License. You may
// obtain a copy of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the license.
////////////////////////////////////////////////////////////////////////////////

#ifndef LBANN_DATA_READER_CONDUIT_HPP
#define LBANN_DATA_READER_CONDUIT_HPP

#include "lbann/data_readers/data_reader.hpp"
#include "lbann/data_store/data_store_conduit.hpp"

namespace lbann {
/**
 * A generalized data reader for Conduit nodes that are passed in by the
 * caller.
 *
 * The reader reports that it produces Conduit output and derives its
 * linearized data and label sizes from the dimension vectors it stores.
 */
class conduit_data_reader : public generic_data_reader
{
public:
  /** Returns a heap-allocated copy of this reader. */
  conduit_data_reader* copy() const override
  {
    return new conduit_data_reader(*this);
  }

  /** This reader always reports Conduit output. */
  bool has_conduit_output() override { return true; }

  void load() override;
  bool fetch_conduit_node(conduit::Node& sample, int data_id) override;

  void set_data_dims(std::vector<int> dims);
  void set_label_dims(std::vector<int> dims);

  /** Name identifying this reader type. */
  std::string get_type() const override { return "conduit_data_reader"; }

  /** Product of all stored data dimensions (1 when none are stored). */
  int get_linearized_data_size() const override
  {
    int product = 1;
    for (auto it = m_data_dims.begin(); it != m_data_dims.end(); ++it) {
      product *= *it;
    }
    return product;
  }

  /** Product of all stored label dimensions (1 when none are stored). */
  int get_linearized_label_size() const override
  {
    int product = 1;
    for (auto it = m_label_dims.begin(); it != m_label_dims.end(); ++it) {
      product *= *it;
    }
    return product;
  }

protected:
  /** Dimensions of a single data sample. */
  std::vector<int> m_data_dims;
  /** Dimensions of a single label. */
  std::vector<int> m_label_dims;

}; // END: class conduit_data_reader

} // namespace lbann

#endif // LBANN_DATA_READER_CONDUIT_HPP
Loading

0 comments on commit 008e912

Please sign in to comment.