From 3d3c6171f4726f8cbc06a14bdeabfcd38b0ce3e3 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-6-220.us-west-2.compute.internal>
Date: Tue, 1 Sep 2020 22:09:00 +0000
Subject: [PATCH 1/8] initial commit

---
 CMakeLists.txt                                |   16 +-
 Makefile                                      |   16 +-
 example/extensions/lib_api/Makefile           |    2 +-
 example/extensions/lib_api/init_lib.cc        |    4 +-
 example/extensions/lib_custom_op/Makefile     |    8 +-
 example/extensions/lib_custom_op/gemm_lib.cc  |    4 +-
 example/extensions/lib_custom_op/relu_lib.cu  |    4 +-
 .../lib_custom_op/transposecsr_lib.cc         |    4 +-
 .../lib_custom_op/transposerowsp_lib.cc       |    4 +-
 example/extensions/lib_pass/Makefile          |    2 +-
 example/extensions/lib_pass/pass_lib.cc       |    4 +-
 example/extensions/lib_subgraph/Makefile      |    2 +-
 .../extensions/lib_subgraph/subgraph_lib.cc   |    4 +-
 include/mxnet/lib_api.h                       | 1519 ++--------------
 src/lib_api.cc                                | 1593 +++++++++++++++++
 15 files changed, 1824 insertions(+), 1362 deletions(-)
 create mode 100644 src/lib_api.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e1ef2a00a76..7704cb7ee9db 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -735,18 +735,18 @@ elseif(MSVC)
 endif()
 
 # extension libraries (custom operators, custom subgraphs) are built by default
-add_library(customop_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/gemm_lib.cc)
-add_library(transposecsr_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposecsr_lib.cc)
-add_library(transposerowsp_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposerowsp_lib.cc)
-add_library(subgraph_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_subgraph/subgraph_lib.cc)
-add_library(pass_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_pass/pass_lib.cc)
+add_library(customop_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/gemm_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(transposecsr_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposecsr_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(transposerowsp_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposerowsp_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(subgraph_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_subgraph/subgraph_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(pass_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_pass/pass_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
 target_include_directories(customop_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 target_include_directories(transposecsr_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 target_include_directories(transposerowsp_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 target_include_directories(subgraph_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 target_include_directories(pass_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 if(USE_CUDA)
-  add_library(customop_gpu_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu)
+  add_library(customop_gpu_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
   target_include_directories(customop_gpu_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
 endif()
 if(UNIX)
@@ -765,7 +765,9 @@ elseif(MSVC)
   set_target_properties(subgraph_lib PROPERTIES PREFIX "lib")
   set_target_properties(pass_lib PROPERTIES PREFIX "lib")
   if(USE_CUDA)
-    target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fPIC>")
+    target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-LD -MT>")
+    target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/LD>")
+    target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/MT>")
     set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib")
   endif()
 endif()
diff --git a/Makefile b/Makefile
index ad17675ce583..8e2bc8b766c5 100644
--- a/Makefile
+++ b/Makefile
@@ -677,28 +677,28 @@ extension_libs: $(EXT_LIBS)
 
 build/libcustomop_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/gemm_lib.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/gemm_lib.cc src/lib_api.cc -o $@ -I include
 build/libcustomop_gpu_lib.so:
 	@mkdir -p $(@D)
-	$(NVCC) -shared -std=c++11 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu -o $@ -I include/mxnet
+	$(NVCC) -shared -std=c++11 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu src/lib_api.cc -o $@ -I include
 build/libsubgraph_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_subgraph/subgraph_lib.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_subgraph/subgraph_lib.cc src/lib_api.cc -o $@ -I include
 build/libtransposecsr_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/transposecsr_lib.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/transposecsr_lib.cc src/lib_api.cc -o $@ -I include
 build/libtransposerowsp_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/transposerowsp_lib.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/transposerowsp_lib.cc src/lib_api.cc -o $@ -I include
 build/libcustomop_gpu_lib.so:
 	@mkdir -p $(@D)
-	$(NVCC) -shared -std=c++11 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu -o $@ -I include/mxnet
+	$(NVCC) -shared -std=c++11 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu src/lib_api.cc -o $@ -I include
 build/libsubgraph_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_subgraph/subgraph_lib.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_subgraph/subgraph_lib.cc src/lib_api.cc -o $@ -I include
 build/libpass_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_pass/pass_lib.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_pass/pass_lib.cc src/lib_api.cc -o $@ -I include
 
 # Cython build
 cython:
diff --git a/example/extensions/lib_api/Makefile b/example/extensions/lib_api/Makefile
index cb529390b77f..e71e4d8ac114 100644
--- a/example/extensions/lib_api/Makefile
+++ b/example/extensions/lib_api/Makefile
@@ -16,7 +16,7 @@
 # under the License.
 
 all:
-	g++ -std=c++11 -shared -fPIC init_lib.cc -o libinit_lib.so -I ../../../include/mxnet
+	g++ -std=c++11 -shared -fPIC init_lib.cc ../../../src/lib_api.cc -o libinit_lib.so -I ../../../include
 
 test:
 	g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../../include/mxnet
diff --git a/example/extensions/lib_api/init_lib.cc b/example/extensions/lib_api/init_lib.cc
index 0ed43761fe53..2b4cfdf06944 100644
--- a/example/extensions/lib_api/init_lib.cc
+++ b/example/extensions/lib_api/init_lib.cc
@@ -24,12 +24,12 @@
  */
 
 #include <iostream>
-#include "lib_api.h"
+#include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
 
 MXReturnValue initialize(int version) {
-  if (version >= 10700) {
+  if (version >= 10800) {
     std::cout << "MXNet version " << version << " supported" << std::endl;
     return MX_SUCCESS;
   } else {
diff --git a/example/extensions/lib_custom_op/Makefile b/example/extensions/lib_custom_op/Makefile
index feded2947ca3..c16d7cd0207e 100644
--- a/example/extensions/lib_custom_op/Makefile
+++ b/example/extensions/lib_custom_op/Makefile
@@ -18,16 +18,16 @@
 all: gemm_lib relu_lib transposecsr_lib transposerowsp_lib
 
 gemm_lib:
-	g++ -shared -fPIC -std=c++11 gemm_lib.cc -o libgemm_lib.so -I ../../../include/mxnet
+	g++ -shared -fPIC -std=c++11 gemm_lib.cc ../../../src/lib_api.cc -o libgemm_lib.so -I ../../../include
 
 relu_lib:
-	nvcc -shared -std=c++11 -Xcompiler -fPIC relu_lib.cu -o librelu_lib.so -I ../../../include/mxnet
+	nvcc -shared -std=c++11 -Xcompiler -fPIC relu_lib.cu ../../../src/lib_api.cc -o librelu_lib.so -I ../../../include
 
 transposecsr_lib:
-	g++ -shared -fPIC -std=c++11 transposecsr_lib.cc -o libtransposecsr_lib.so -I ../../../include/mxnet
+	g++ -shared -fPIC -std=c++11 transposecsr_lib.cc ../../../src/lib_api.cc -o libtransposecsr_lib.so -I ../../../include
 
 transposerowsp_lib:
-	g++ -shared -fPIC -std=c++11 transposerowsp_lib.cc -o libtransposerowsp_lib.so -I ../../../include/mxnet
+	g++ -shared -fPIC -std=c++11 transposerowsp_lib.cc ../../../src/lib_api.cc -o libtransposerowsp_lib.so -I ../../../include
 
 clean:
 	rm -rf libgemm_lib.so librelu_lib.so libtransposecsr_lib.so libtransposerowsp_lib.so
diff --git a/example/extensions/lib_custom_op/gemm_lib.cc b/example/extensions/lib_custom_op/gemm_lib.cc
index 59905c896bef..f8d1d326a008 100644
--- a/example/extensions/lib_custom_op/gemm_lib.cc
+++ b/example/extensions/lib_custom_op/gemm_lib.cc
@@ -24,7 +24,7 @@
  */
 
 #include <iostream>
-#include "lib_api.h"
+#include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
 
@@ -226,7 +226,7 @@ REGISTER_OP(state_gemm)
 .setCreateOpState(createOpState, "cpu");
 
 MXReturnValue initialize(int version) {
-  if (version >= 10700) {
+  if (version >= 10800) {
     std::cout << "MXNet version " << version << " supported" << std::endl;
     return MX_SUCCESS;
   } else {
diff --git a/example/extensions/lib_custom_op/relu_lib.cu b/example/extensions/lib_custom_op/relu_lib.cu
index 7022c76e6999..e4aa8a3decc3 100644
--- a/example/extensions/lib_custom_op/relu_lib.cu
+++ b/example/extensions/lib_custom_op/relu_lib.cu
@@ -24,7 +24,7 @@
  */
 
 #include <iostream>
-#include "lib_api.h"
+#include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
 
@@ -261,7 +261,7 @@ REGISTER_OP(my_noisy_relu)
 .setBackward(backwardGPU, "gpu");
 
 MXReturnValue initialize(int version) {
-  if (version >= 10700) {
+  if (version >= 10800) {
     std::cout << "MXNet version " << version << " supported" << std::endl;
     return MX_SUCCESS;
   } else {
diff --git a/example/extensions/lib_custom_op/transposecsr_lib.cc b/example/extensions/lib_custom_op/transposecsr_lib.cc
index d3941d74c969..80053ec55cd5 100644
--- a/example/extensions/lib_custom_op/transposecsr_lib.cc
+++ b/example/extensions/lib_custom_op/transposecsr_lib.cc
@@ -24,7 +24,7 @@
  */
 
 #include <iostream>
-#include "lib_api.h"
+#include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
 
@@ -192,7 +192,7 @@ REGISTER_OP(my_state_transposecsr)
 .setCreateOpState(createOpState, "cpu");
 
 MXReturnValue initialize(int version) {
-  if (version >= 10700) {
+  if (version >= 10800) {
     std::cout << "MXNet version " << version << " supported" << std::endl;
     return MX_SUCCESS;
   } else {
diff --git a/example/extensions/lib_custom_op/transposerowsp_lib.cc b/example/extensions/lib_custom_op/transposerowsp_lib.cc
index 90ad594d556b..d6addb39c4d3 100644
--- a/example/extensions/lib_custom_op/transposerowsp_lib.cc
+++ b/example/extensions/lib_custom_op/transposerowsp_lib.cc
@@ -24,7 +24,7 @@
  */
 
 #include <iostream>
-#include "lib_api.h"
+#include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
 
@@ -194,7 +194,7 @@ REGISTER_OP(my_state_transposerowsp)
 .setCreateOpState(createOpState, "cpu");
 
 MXReturnValue initialize(int version) {
-  if (version >= 10700) {
+  if (version >= 10800) {
     std::cout << "MXNet version " << version << " supported" << std::endl;
     return MX_SUCCESS;
   } else {
diff --git a/example/extensions/lib_pass/Makefile b/example/extensions/lib_pass/Makefile
index 759a08c48c89..e555b191ecf5 100644
--- a/example/extensions/lib_pass/Makefile
+++ b/example/extensions/lib_pass/Makefile
@@ -18,7 +18,7 @@
 all: pass_lib
 
 pass_lib:
-	g++ -shared -fPIC -std=c++11 pass_lib.cc -o libpass_lib.so -I ../../../include/mxnet
+	g++ -shared -fPIC -std=c++11 pass_lib.cc ../../../src/lib_api.cc -o libpass_lib.so -I ../../../include
 
 clean:
 	rm -rf libpass_lib.so
diff --git a/example/extensions/lib_pass/pass_lib.cc b/example/extensions/lib_pass/pass_lib.cc
index 5f5137319999..5c7c6cf6498a 100644
--- a/example/extensions/lib_pass/pass_lib.cc
+++ b/example/extensions/lib_pass/pass_lib.cc
@@ -26,7 +26,7 @@
 #include <math.h>
 #include <iostream>
 #include <algorithm>
-#include "lib_api.h"
+#include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
 
@@ -44,7 +44,7 @@ REGISTER_PASS(myPass)
 .setBody(myPass);
 
 MXReturnValue initialize(int version) {
-  if (version >= 10700) {
+  if (version >= 10800) {
     std::cout << "MXNet version " << version << " supported" << std::endl;
     return MX_SUCCESS;
   } else {
diff --git a/example/extensions/lib_subgraph/Makefile b/example/extensions/lib_subgraph/Makefile
index c45100b69ef7..5449e3af9c58 100644
--- a/example/extensions/lib_subgraph/Makefile
+++ b/example/extensions/lib_subgraph/Makefile
@@ -18,7 +18,7 @@
 all: subgraph_lib
 
 subgraph_lib:
-	g++ -shared -fPIC -std=c++11 subgraph_lib.cc -o libsubgraph_lib.so -I ../../../include/mxnet
+	g++ -shared -fPIC -std=c++11 subgraph_lib.cc ../../../src/lib_api.cc -o libsubgraph_lib.so -I ../../../include
 
 clean:
 	rm -rf libsubgraph_lib.so
diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc
index 2f954e092152..51ff77456be1 100644
--- a/example/extensions/lib_subgraph/subgraph_lib.cc
+++ b/example/extensions/lib_subgraph/subgraph_lib.cc
@@ -26,7 +26,7 @@
 #include <math.h>
 #include <iostream>
 #include <algorithm>
-#include "lib_api.h"
+#include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
 
@@ -320,7 +320,7 @@ REGISTER_PASS(addInputPass)
 .setBody(addInputPass);
 
 MXReturnValue initialize(int version) {
-  if (version >= 10700) {
+  if (version >= 10800) {
     std::cout << "MXNet version " << version << " supported" << std::endl;
     return MX_SUCCESS;
   } else {
diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index edab4a405174..57267d420a33 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -48,11 +48,12 @@
 #include <sstream>
 
 #if defined(__NVCC__)
+  #include <cuda_runtime.h>
   #include <curand_kernel.h>
 #endif
 
 /* Make sure to update the version number everytime you make changes */
-#define MX_LIBRARY_VERSION 8
+#define MX_LIBRARY_VERSION 9
 
 /*!
  * \brief For loading multiple custom op libraries in Linux, exporting same symbol multiple
@@ -216,6 +217,33 @@ extern "C" {
 namespace mxnet {
 namespace ext {
 
+/* \brief Class to store error messages from extensions to pass to MXNet */
+class MXerrorMsgs {
+ public:
+  /* \brief get singleton pointer to class */
+  static MXerrorMsgs* get();
+
+  /* \brief add a new error message */
+  std::stringstream& add(const char* file, int line);
+
+  /* \brief return number of error messages */
+  int size();
+
+  /* \brief get error message at index */
+  const std::string* get(int idx);
+
+ private:
+  /*! \brief constructor */
+  MXerrorMsgs() {}
+  /*! \brief destructor */
+  ~MXerrorMsgs() {}
+  /*! \brief stored error messages, one stringstream per message */
+  std::vector<std::stringstream> messages;
+};
+
+// Add a new error message, example: MX_ERROR_MSG << "my error msg";
+#define MX_ERROR_MSG MXerrorMsgs::get()->add(__FILE__, __LINE__)
+
 /*!
  * \brief Tensor data type, consistent with mshadow data type
  */
@@ -248,15 +276,13 @@ enum MXStorageType {
  * dev_id is the device index where the tensor locates
  */
 struct MXContext {
-  MXContext() : dev_type("error"), dev_id(-1) {}
-  explicit MXContext(std::string dev_type_, int dev_id_)
-    : dev_type(dev_type_), dev_id(dev_id_) {}
-  explicit MXContext(const char* dev_type_, int dev_id_)
-    : dev_type(dev_type_), dev_id(dev_id_) {}
-  static MXContext CPU() { return MXContext("cpu", 0); }
-  static MXContext GPU() { return MXContext("gpu", 0); }
-  static MXContext CPU(int dev_id) { return MXContext("cpu", dev_id); }
-  static MXContext GPU(int dev_id) { return MXContext("gpu", dev_id); }
+  MXContext();
+  explicit MXContext(std::string dev_type_, int dev_id_);
+  explicit MXContext(const char* dev_type_, int dev_id_);
+  static MXContext CPU();
+  static MXContext GPU();
+  static MXContext CPU(int dev_id);
+  static MXContext GPU(int dev_id);
 
   std::string dev_type;
   int dev_id;
@@ -286,114 +312,24 @@ struct MXSparse {
   int64_t indptr_len;
 
   void set(void *data_ptr, const int64_t* dims, int ndims, void *idx,
-          int64_t num_idx, void *idx_ptr = nullptr, int64_t num_idx_ptr = 0) {
-    data = data_ptr;
-    // If CSR, num of non-zero elemets is num_idx,
-    // If row sparse, num of elements is num_idx * width.
-    data_len = num_idx;
-    if (!idx_ptr) {
-      for (int i = 1; i < ndims; ++i)
-         data_len *= dims[i];
-    }
-
-    indices = reinterpret_cast<int64_t*>(idx);
-    indices_len = num_idx;
-
-    if (idx_ptr) {
-      indptr = reinterpret_cast<int64_t*>(idx_ptr);
-      indptr_len = num_idx_ptr;
-    }
-  }
+           int64_t num_idx, void *idx_ptr = nullptr, int64_t num_idx_ptr = 0);
 };
 
 /*!
  * \brief Tensor data structure used by custom operator
  */
 struct MXTensor {
-  MXTensor() : data_ptr(nullptr), dtype(kUNSET), verID(0), stype(kDefaultStorage) {}
-  MXTensor(const MXTensor& oth) : data_ptr(oth.data_ptr), shape(oth.shape),
-    dtype(oth.dtype), verID(oth.verID), ctx(oth.ctx), stype(oth.stype) {
-    setDLTensor();
-  }
-  MXTensor(void *data_ptr, const std::vector<int64_t> &shape, MXDType dtype,
-           size_t vID, MXContext mx_ctx, MXStorageType stype = kDefaultStorage)
-  : data_ptr(data_ptr), shape(shape), dtype(dtype), verID(vID), ctx(mx_ctx), stype(stype) {
-    setDLTensor();
-  }
+  MXTensor();
+  MXTensor(const MXTensor& oth);
+  MXTensor(void *data_ptr, std::vector<int64_t> shape, MXDType dtype,
+           size_t vID, MXContext mx_ctx, MXStorageType stype = kDefaultStorage);
 
   /*! \brief populate internal tensor fields */
   void setTensor(void *dptr, MXDType type, const int64_t* dims, int ndims,
-                 size_t vID, MXContext mx_ctx, MXStorageType storage_type) {
-    data_ptr = dptr; dtype = type; verID = vID; ctx = mx_ctx; stype = storage_type;
-    shape.clear();
-    for (int j = 0; j < ndims; j++) {
-      shape.push_back(dims[j]);
-    }
-    setDLTensor();
-  }
+                 size_t vID, MXContext mx_ctx, MXStorageType storage_type);
 
   /*! \brief populate DLTensor fields */
-  void setDLTensor() {
-    dltensor.data = data_ptr;
-    dltensor.ndim = shape.size();
-    dltensor.shape = const_cast<int64_t*>(shape.data());
-    dltensor.strides = nullptr;
-    dltensor.byte_offset = 0;
-    dltensor.dtype.lanes = 1;
-    dltensor.ctx.device_id = ctx.dev_id;
-    if (ctx.dev_type == "cpu")
-      dltensor.ctx.device_type = kDLCPU;
-    else if (ctx.dev_type == "gpu")
-      dltensor.ctx.device_type = kDLGPU;
-    else if (ctx.dev_type == "opencl")
-      dltensor.ctx.device_type = kDLOpenCL;
-    else if (ctx.dev_type == "vulcan")
-      dltensor.ctx.device_type = kDLVulkan;
-    else if (ctx.dev_type == "metal")
-      dltensor.ctx.device_type = kDLMetal;
-    else if (ctx.dev_type == "vpi")
-      dltensor.ctx.device_type = kDLVPI;
-    else if (ctx.dev_type == "rocm")
-      dltensor.ctx.device_type = kDLROCM;
-    else
-      dltensor.ctx.device_type = kDLExtDev;
-    switch (dtype) {
-    case kFloat32:
-      dltensor.dtype.code = kDLFloat;
-      dltensor.dtype.bits = 32;
-      break;
-    case kFloat64:
-      dltensor.dtype.code = kDLFloat;
-      dltensor.dtype.bits = 64;
-      break;
-    case kFloat16:
-      dltensor.dtype.code = kDLFloat;
-      dltensor.dtype.bits = 16;
-      break;
-    case kUint8:
-      dltensor.dtype.code = kDLUInt;
-      dltensor.dtype.bits = 8;
-      break;
-    case kInt32:
-      dltensor.dtype.code = kDLInt;
-      dltensor.dtype.bits = 32;
-      break;
-    case kInt8:
-      dltensor.dtype.code = kDLInt;
-      dltensor.dtype.bits = 8;
-      break;
-    case kInt64:
-      dltensor.dtype.code = kDLInt;
-      dltensor.dtype.bits = 64;
-      break;
-    default:
-      dltensor.dtype.code = 0;
-      dltensor.dtype.bits = 0;
-      throw std::runtime_error("Error! Invalid dtype flag: "
-                               + std::to_string(static_cast<int>(dtype))
-                               + " when constructing MXTensor");
-    }
-  }
+  void setDLTensor();
 
   /*! \brief helper function to cast data pointer */
   template<typename data_type>
@@ -402,24 +338,10 @@ struct MXTensor {
   }
 
   /*! \brief helper function to get data size */
-  inline int64_t size() const {
-    int64_t size = 1;
-    for (unsigned int i = 0; i < shape.size(); i++) {
-      size *= shape[i];
-    }
-    return size;
-  }
+  int64_t size() const;
 
   /*! \brief helper function to compare two MXTensors */
-  inline bool isSame(const MXTensor &oth) const {
-    return data_ptr == oth.data_ptr &&
-           dtype == oth.dtype &&
-           verID == oth.verID &&
-           ctx.dev_type == oth.ctx.dev_type &&
-           ctx.dev_id == oth.ctx.dev_id &&
-           shape == oth.shape &&
-           stype == oth.stype;
-  }
+  bool isSame(const MXTensor &oth) const;
 
   // For dense, data_ptr points to 1D flattened tensor data
   // For sparse, data_ptr points to MXSparse
@@ -473,28 +395,15 @@ class PassResource {
  public:
   PassResource(std::unordered_map<std::string, MXTensor>* new_args,
                std::unordered_map<std::string, MXTensor>* new_aux,
-               nd_malloc_t nd_malloc, const void* nd_alloc)
-    : new_args_(new_args), new_aux_(new_aux), nd_malloc_(nd_malloc), nd_alloc_(nd_alloc) {}
+               nd_malloc_t nd_malloc, const void* nd_alloc);
+
   // allocate new arg param, adds to args map, returns newly allocated tensor
   MXTensor* alloc_arg(const std::string& name, const std::vector<int64_t>& shapes,
-                      const MXContext &ctx, MXDType dtype) const {
-    void* data;
-    nd_malloc_(nd_alloc_, shapes.data(), shapes.size(), ctx.dev_type.c_str(), ctx.dev_id,
-               dtype, name.c_str(), 1, &data);
-    MXTensor tensor(data, shapes, dtype, 0, ctx, kDefaultStorage);
-    (*new_args_)[name] = tensor;
-    return &(new_args_->at(name));
-  }
+                      const MXContext &ctx, MXDType dtype) const;
+
   // allocate new aux param, adds to aux map, returns newly allocated tensor
   MXTensor* alloc_aux(const std::string& name, const std::vector<int64_t>& shapes,
-                      const MXContext &ctx, MXDType dtype) const {
-    void* data;
-    nd_malloc_(nd_alloc_, shapes.data(), shapes.size(), ctx.dev_type.c_str(), ctx.dev_id,
-               dtype, name.c_str(), 0, &data);
-    MXTensor tensor(data, shapes, dtype, 0, ctx, kDefaultStorage);
-    (*new_aux_)[name] = tensor;
-    return &(new_aux_->at(name));
-  }
+                      const MXContext &ctx, MXDType dtype) const;
 
  private:
   std::unordered_map<std::string, MXTensor>* new_args_;
@@ -511,43 +420,30 @@ class OpResource {
   OpResource(xpu_malloc_t cpu_malloc_fp, void* cpu_alloc_fp,
              xpu_malloc_t gpu_malloc_fp, void* gpu_alloc_fp, void* stream,
              sparse_malloc_t sparse_malloc_fp, void* sparse_alloc_fp,
-             void* rng_cpu_states, void* rng_gpu_states)
-    : cpu_malloc(cpu_malloc_fp), gpu_malloc(gpu_malloc_fp),
-      cpu_alloc(cpu_alloc_fp), gpu_alloc(gpu_alloc_fp), cuda_stream(stream),
-      sparse_malloc(sparse_malloc_fp), sparse_alloc(sparse_alloc_fp),
-      rand_cpu_states(rng_cpu_states), rand_gpu_states(rng_gpu_states) {}
+             void* rng_cpu_states, void* rng_gpu_states);
 
   /*! \brief allocate cpu memory controlled by MXNet */
-  void* alloc_cpu(int size) const {
-    return cpu_malloc(cpu_alloc, size);
-  }
+  void* alloc_cpu(int size) const;
 
   /*! \brief allocate gpu memory controlled by MXNet */
-  void* alloc_gpu(int size) const {
-    return gpu_malloc(gpu_alloc, size);
-  }
+  void* alloc_gpu(int size) const;
 
   /*! \brief return the cuda stream object with correct type */
-  mx_stream_t get_cuda_stream() const {
+  inline mx_stream_t get_cuda_stream() const {
     return static_cast<mx_stream_t>(cuda_stream);
   }
 
   /*! \brief allocate sparse memory controlled by MXNet */
-  void alloc_sparse(MXSparse* sparse, int index, int indices_len, int indptr_len = 0) const {
-    sparse_malloc(sparse_alloc, index, indices_len, indptr_len,
-                   &(sparse->data), &(sparse->indices), &(sparse->indptr));
-  }
+  void alloc_sparse(MXSparse* sparse, int index, int indices_len, int indptr_len = 0) const;
 
   /*! \brief get pointer to initialized and seeded random number states located on CPU */
   /* Access each state by states[id], but this id should be <= MX_NUM_CPU_RANDOM_STATES */
-  mx_cpu_rand_t* get_cpu_rand_states() const {
-    return static_cast<mx_cpu_rand_t*>(rand_cpu_states);
-  }
+  mx_cpu_rand_t* get_cpu_rand_states() const;
 
   /*! \brief get pointer to initialized and seeded random number states located on GPU */
   /* Access each state by states[id], but this id should be <= MX_NUM_GPU_RANDOM_STATES */
   /* Note that if you are using cpu build, it will return a nullptr */
-  mx_gpu_rand_t* get_gpu_rand_states() const {
+  inline mx_gpu_rand_t* get_gpu_rand_states() const {
     return static_cast<mx_gpu_rand_t*>(rand_gpu_states);
   }
 
@@ -582,15 +478,7 @@ class OpResource {
  * getShapeAt("[[1]]", 0) returns "[1]"
  * getShapeAt("[[1],[2,3]]", 1) returns "[2,3]"
  */
-std::string getShapeAt(const std::string& shape, unsigned index) {
-  int idx = 1;  // start at 1 to skip the first square bracket [
-  // find the beginning of the output shape for the particular output index
-  for (unsigned x=0; x < index; x++)
-    idx = shape.find("[", idx+1);
-  int stop = shape.find("]", idx);  // find stop index for this output shape
-  // add this shape to the list
-  return shape.substr(idx, stop-idx+1);
-}
+std::string getShapeAt(const std::string& shape, unsigned index);
 
 /* \brief get dtype value from list of dtypes string
  *
@@ -599,15 +487,7 @@ std::string getShapeAt(const std::string& shape, unsigned index) {
  * getDtypeAt("[1]", 0) returns "1"
  * getDtypeAt("[1,2]", 1) returns "2" 
  */
-std::string getDtypeAt(const std::string& dtype, unsigned index) {
-  // find the beginning of the output dtype for the particular output index
-  int idx = 0;
-  for (unsigned x=0; x < index; x++)
-    idx = dtype.find(",", idx+1);
-  int stop = dtype.find(",", idx+1);  // find stop index for this output dtype
-  if (stop == -1) stop = dtype.find("]", idx+1);
-  return dtype.substr(idx+1, stop-idx-1);
-}
+std::string getDtypeAt(const std::string& dtype, unsigned index);
 
 /*!
  * \brief Json utility to parse serialized subgraph symbol
@@ -617,196 +497,41 @@ enum JsonType {ERR, STR, NUM, LIST, MAP};
 
 /*! \brief definition of JSON objects */
 struct JsonVal {
-  JsonVal() : type(ERR), num(-1), str("") {}  // default constructor
+  JsonVal();  // default constructor
   // construct a JSON object by type
-  explicit JsonVal(JsonType t) : type(t), num(-1), str("") {}
+  explicit JsonVal(JsonType t);
   // construct a string JSON object
-  explicit JsonVal(std::string s) : type(STR), num(-1), str(s) {}
+  explicit JsonVal(std::string s);
   // construct a number JSON object
-  explicit JsonVal(int n) : type(NUM), num(n), str(std::to_string(n)) {}
+  explicit JsonVal(int n);
   // complex constructor
-  JsonVal(JsonType t, int n, std::string s) : type(t), num(n), str(s) {}
-  bool operator<(const JsonVal &o) const {
-    // for string JSON objects compare the string
-    if (type == STR) return type == o.type && str < o.str;
-    // for number JSON objects compare the number
-    if (type == NUM) return type == o.type && num < o.num;
-    // for list JSON objects, compare the size of list, and then each object in the list
-    if (type == LIST) {
-      if (list.size() != o.list.size()) return false;
-      for (unsigned int i=0; i< list.size(); i++)
-        if (list[i] < o.list[i])
-          return false;  // if we find an object that doesnt match return
-      return true;  // all objects in lists matched
-    }
-    // for map JSON objects, compare the size of map, and then each key/value in the maps
-    if (type == MAP) {
-      if (map.size() != o.map.size()) return false;
-      for (auto &item : map) {
-        // if one map is missing a key in another return
-        if (o.map.find(item.first) == o.map.end()) return false;
-        if (item.second < o.map.at(item.first)) return false;
-      }
-      return true;
-    }
-    return type < o.type;
-  }
+  JsonVal(JsonType t, int n, std::string s);
+  bool operator<(const JsonVal &o) const;
 
   // convert JSON object back to JSON-compatible string
-  std::string dump() const {
-    std::string ret;
-    switch (type) {
-    case ERR:
-      ret = "json(Error)";
-      break;
-    case STR:
-      ret = "\"" + str + "\"";
-      break;
-    case NUM:
-      ret = str;
-      break;
-    case LIST:
-      ret = "[";
-      for (unsigned i=0; i < list.size(); i++) {
-        auto &item = list[i];
-        ret += item.dump();
-        if (i < list.size()-1)
-          ret += ",";
-      }
-      ret += "]";
-      break;
-    case MAP:
-      ret = "{";
-      unsigned cnt = 0;
-      for (auto &item : map) {
-        ret += item.first.dump() + " : " + item.second.dump();
-        if (cnt++ < map.size()-1)
-          ret += ",";
-      }
-      ret += "}";
-      break;
-    }
-    return ret;
-  }
+  std::string dump() const;
+
   // convert JSON-compatible string to JSON object
-  static JsonVal parse(const std::string& json) {
-    unsigned int idx = 0;
-    return JsonVal::parse(json, &idx);
-  }
+  static JsonVal parse(const std::string& json);
+
   // parse a string JSON object
-  static JsonVal parse_string(const std::string& json, unsigned int* idx) {
-    JsonVal ret(STR);
-    while (*idx < json.size()) {
-      if (json[*idx] == '"') {
-        ++(*idx);
-        return ret;
-      } else {
-        ret.str += json[*idx];
-        ++(*idx);
-      }
-    }
-    std::cout << "Error! Unable to parse string" << std::endl;
-    return JsonVal();
-  }
+  static JsonVal parse_string(const std::string& json, unsigned int* idx);
+
   // parse a number JSON object
-  static JsonVal parse_num(const std::string& json, unsigned int* idx) {
-    JsonVal ret(NUM);
-    while (*idx < json.size()) {
-      if (json[*idx] >= '0' && json[*idx] <= '9') {
-        ret.str += json[*idx];
-        ++(*idx);
-      } else {
-        break;
-      }
-    }
-    ret.num = std::stoi(ret.str);
-    return ret;
-  }
+  static JsonVal parse_num(const std::string& json, unsigned int* idx);
+
   // parse a list of JSON objects
-  static JsonVal parse_list(const std::string& json, unsigned int* idx) {
-    JsonVal ret(LIST);
-    while (*idx < json.size()) {
-      if (json[*idx] == ']') {
-        ++(*idx);
-        return ret;
-      } else {
-        JsonVal item = JsonVal::parse(json, idx);
-        if (item.type != ERR)
-          ret.list.push_back(item);
-      }
-    }
-    std::cout << "Error! Unable to parse list" << std::endl;
-    return JsonVal();
-  }
+  static JsonVal parse_list(const std::string& json, unsigned int* idx);
+
   // parse a map of JSON objects
-  static JsonVal parse_map(const std::string& json, unsigned int* idx) {
-    JsonVal ret(MAP), key;
-    while (*idx < json.size()) {
-      if (json[*idx] == '}') {
-        ++(*idx);
-        return ret;
-      } else {
-        JsonVal item = JsonVal::parse(json, idx);
-        if (key.type == ERR) {
-          key = item;
-        } else {
-          ret.map[key] = item;
-          key.type = ERR;
-        }
-      }
-    }
-    std::cout << "Error! Unable to parse map" << std::endl;
-    return JsonVal();
-  }
+  static JsonVal parse_map(const std::string& json, unsigned int* idx);
+
   // generic parse function
-  static JsonVal parse(const std::string& json, unsigned int *idx) {
-    JsonVal ret;
-    while (*idx < json.size()) {
-      if (json[*idx] == '"') {
-        ++(*idx);
-        ret = JsonVal::parse_string(json, idx);
-      } else if (json[*idx] >= '0' && json[*idx] <= '9') {
-        ret = JsonVal::parse_num(json, idx);
-      } else if (json[*idx] == '[') {
-        ++(*idx);
-        ret = JsonVal::parse_list(json, idx);
-      } else if (json[*idx] == '{') {
-        ++(*idx);
-        ret = JsonVal::parse_map(json, idx);
-      } else if (json[*idx] == ']' || json[*idx] == '}') {return ret;}
-      if (ret.type != ERR) return ret;
-      ++(*idx);
-    }
-    return ret;
-  }
+  static JsonVal parse(const std::string& json, unsigned int *idx);
+
   // debug function to convert data structure to a debugstring
-  std::string toString() const {
-    std::string ret;
-    switch (type) {
-    case ERR:
-      ret = "json(Error)";
-      break;
-    case STR:
-      ret = "json(STR:" + str + ")";
-      break;
-    case NUM:
-      ret = "json(INT:" + str + ")";
-      break;
-    case LIST:
-      ret = "json(LIST:[";
-      for (auto &item : list)
-        ret += item.toString() + ",";
-      ret += "])";
-      break;
-    case MAP:
-      ret = "json(MAP:{";
-      for (auto &item : map)
-        ret += item.first.toString() + " : " + item.second.toString() + ",";
-      ret += "})";
-      break;
-    }
-    return ret;
-  }
+  std::string toString() const;
+
   JsonType type;
   int num;
   std::string str;
@@ -829,25 +554,19 @@ struct NodeEntry {
 // Representation of a node in the graph
 class Node {
  public:
-  Node() {tensor = nullptr;}
+  Node();
+
   // internally set passResource to enable tensor allocation for graph passes
-  void _setPassResource(PassResource* res_) {res = res_;}
+  void _setPassResource(PassResource* res_);
+
   /* \brief allocate an arg tensor for this node */
   void alloc_arg(const std::vector<int64_t>& shapes,
-                 const MXContext &ctx, MXDType dtype) {
-    if (!res)
-      throw std::runtime_error(
-                 "Node not initialized. Cannot use alloc_arg outside of graph passes.");
-    tensor = res->alloc_arg(name, shapes, ctx, dtype);
-  }
+                 const MXContext &ctx, MXDType dtype);
+
   /* \brief allocate an aux tensor for this node */
   void alloc_aux(const std::vector<int64_t>& shapes,
-                 const MXContext &ctx, MXDType dtype) {
-    if (!res)
-      throw std::runtime_error(
-                 "Node not initialized. Cannot use alloc_aux outside of graph passes.");
-    tensor = res->alloc_aux(name, shapes, ctx, dtype);
-  }
+                 const MXContext &ctx, MXDType dtype);
+
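-  std::string op;  // operator name (ie. Convolution)
-  std::string name;  // unique node name (ie. conv_0 or conv_1)
+  std::string op;  // operator name (e.g. Convolution)
+  std::string name;  // unique node name (e.g. conv_0 or conv_1)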
   MXTensor* tensor;  // tensor data for input nodes
@@ -863,297 +582,57 @@ class Node {
 // Representation of the graph
 class Graph {
  public:
-  Graph() : res(nullptr) {}
+  Graph();
+
-  /* \brief deleted nodes when deleting the graph */
+  /* \brief delete every node in the graph when the graph is destroyed */
-  ~Graph() {
-    for (size_t i = 0; i < nodes.size(); i++)
-      delete nodes[i];
-  }
+  ~Graph();
 
   /* \brief create a graph object from an unparsed string */
-  static Graph* fromString(const std::string& json) {
-    JsonVal val = JsonVal::parse(json);
-    return fromJson(val);
-  }
+  static Graph* fromString(const std::string& json);
 
   /* \brief create a graph object from a parsed JSON object */
-  static Graph* fromJson(JsonVal val) {
-    // get nodes list
-    JsonVal nodes = val.map[JsonVal("nodes")];
-    Graph *g = new Graph();
-
-    std::map<int, Node*> nodeMap;
-    // loop over nodes
-    for (size_t i = 0; i < nodes.list.size(); i++) {
-      Node* n = new Node();
-      g->nodes.push_back(n);
-      JsonVal node = nodes.list[i];
-
-      // set the op info
-      n->op = node.map[JsonVal("op")].str;
-      n->name = node.map[JsonVal("name")].str;
-
-      // if op is null it is an input to the graph
-      if (n->op.compare("null") == 0)
-        g->inputs.push_back(n);
-
-      // set attrs
-      JsonVal attributes = node.map[JsonVal("attrs")];
-      for (auto& kv : attributes.map) {
-        n->attrs[kv.first.str] = kv.second.str;
-      }
-
-      // set subgraphs, parsing each into a graph
-      if (node.map.count(JsonVal("subgraphs")) > 0) {
-        JsonVal subgraphs = node.map[JsonVal("subgraphs")];
-        for (auto &subgraph : subgraphs.list) {
-          n->subgraphs.push_back(fromJson(subgraph));
-        }
-      }
-
-      // set node inputs
-      JsonVal node_inputs = node.map[JsonVal("inputs")];
-      n->inputs.resize(node_inputs.list.size());
-      for (size_t j = 0; j < node_inputs.list.size(); j++) {
-        JsonVal input = node_inputs.list[j];
-        NodeEntry& entry = n->inputs[j];
-        // get pointer to other node
-        entry.node = nodeMap[input.list[0].num];
-        // get the other node's output index
-        entry.entry = input.list[1].num;
-        // set other nodes output as connected to this node
-        entry.node->outputs.push_back({n, static_cast<int>(j)});
-      }
-      nodeMap[i] = n;
-    }
-
-    // set graph level outputs
-    JsonVal& heads = val.map[JsonVal("heads")];
-    g->outputs.resize(heads.list.size());
-    for (size_t i = 0; i < heads.list.size(); i++) {
-      JsonVal head = heads.list[i];
-      g->outputs[i].node = nodeMap[head.list[0].num];
-      g->outputs[i].entry = head.list[1].num;
-    }
-
-    // add all attributes to the graph
-    for (auto& kv : val.map) {
-      if (kv.first.str.compare("nodes") != 0 &&
-         kv.first.str.compare("heads") != 0 &&
-         kv.first.str.compare("node_row_ptr") != 0 &&
-         kv.first.str.compare("arg_nodes") != 0) {
-        g->attrs[kv.first.str] = kv.second;
-      }
-    }
-    return g;
-  }
+  static Graph* fromJson(JsonVal val);
 
   /* \brief convert graph object back to JSON object */
-  JsonVal toJson() {
-    // top level object is a map
-    JsonVal val(MAP);
-
-    // add attributes
-    for (auto& kv : attrs) {
-      val.map[JsonVal(kv.first)] = kv.second;
-    }
-
-    // sort graph nodes in topological order, create mapping of node to index
-    std::map<Node*, int> nodeMap;
-    std::vector<Node*> sorted = topological_sort();
-    // nodes are in reverse topological order in the vector (back is first)
-    // so loop from end to front over the vector 'sorted'
-    for (int i = sorted.size()-1; i >= 0; i--) {
-      nodeMap[sorted[i]] = sorted.size()-1-i;
-    }
-
-    // create node_row_ptr entry
-    val.map[JsonVal("node_row_ptr")] = JsonVal(LIST);
-    JsonVal& node_row_ptr = val.map[JsonVal("node_row_ptr")];
-    for (size_t i = 0; i < nodes.size(); i++)
-      node_row_ptr.list.push_back(JsonVal(i));
-
-    // add all input nodes
-    val.map[JsonVal("arg_nodes")] = JsonVal(LIST);
-    JsonVal& arg_nodes = val.map[JsonVal("arg_nodes")];
-    for (size_t i = 0; i < inputs.size(); i++)
-      arg_nodes.list.push_back(JsonVal(nodeMap[inputs[i]]));
-
-    // add all output nodes
-    val.map[JsonVal("heads")] = JsonVal(LIST);
-    JsonVal& heads = val.map[JsonVal("heads")];
-    for (size_t i = 0; i < outputs.size(); i++) {
-      heads.list.push_back(JsonVal(LIST));
-      JsonVal& out = heads.list[i];
-      out.list.push_back(JsonVal(nodeMap[outputs[i].node]));
-      out.list.push_back(JsonVal(outputs[i].entry));
-      out.list.push_back(JsonVal(0));
-    }
-
-    // add all graph nodes
-    val.map[JsonVal("nodes")] = JsonVal(LIST);
-    JsonVal& nodes_ = val.map[JsonVal("nodes")];
-    for (int i = sorted.size()-1; i >= 0; i--) {
-      // each node is a map
-      nodes_.list.push_back(JsonVal(MAP));
-      Node* n = sorted[i];
-      JsonVal& n_ = nodes_.list[nodes_.list.size()-1];
-
-      n_.map[JsonVal("op")] = JsonVal(n->op);
-      n_.map[JsonVal("name")] = JsonVal(n->name);
-      n_.map[JsonVal("inputs")] = JsonVal(LIST);
-
-      // add inputs for this node
-      JsonVal& inputs_ = n_.map[JsonVal("inputs")];
-      for (size_t j = 0; j < n->inputs.size(); j++) {
-        inputs_.list.push_back(JsonVal(LIST));
-        NodeEntry& entry = n->inputs[j];
-        JsonVal& in = inputs_.list[j];
-        in.list.push_back(JsonVal(nodeMap[entry.node]));
-        in.list.push_back(JsonVal(entry.entry));
-        in.list.push_back(JsonVal(0));
-      }
-
-      // add subgraphs for this node, convert each back to JSON
-      if (n->subgraphs.size() > 0) {
-        n_.map[JsonVal("subgraphs")] = JsonVal(LIST);
-        JsonVal &subgraphs_ = n_.map[JsonVal("subgraphs")];
-        for (Graph *subgraph : n->subgraphs) {
-          subgraphs_.list.push_back(subgraph->toJson());
-        }
-      }
-
-      // add attributes for this node
-      n_.map[JsonVal("attrs")] = JsonVal(MAP);
-      JsonVal& attrs_ = n_.map[JsonVal("attrs")];
-      for (auto& kv : n->attrs) {
-        attrs_.map[JsonVal(kv.first)] = JsonVal(kv.second);
-      }
-    }
-    return val;
-  }
+  JsonVal toJson();
 
   /* \brief convert graph object to JSON string */
-  std::string toString() {
-    return toJson().dump();
-  }
+  std::string toString();
 
   /* \brief visits a node "n" */
   void _dfs_util(Node* n, std::unordered_set<Node*>* to_visit,
-                 std::function<void(Node*)> handler) const {
-    to_visit->erase(n);  // remove node now that we're visiting it
-    for (NodeEntry& e : n->outputs) {
-      Node* o = e.node;
-      if (to_visit->count(o) != 0) {
-        _dfs_util(o, to_visit, handler);  // visit neighbor
-      }
-    }
-    handler(n);  // post-order visit this node
-  }
+                 std::function<void(Node*)> handler) const;
 
   /* \brief post-order DFS graph traversal */
-  void DFS(std::function<void(Node*)> handler) const {
-    std::unordered_set<Node*> to_visit;
-    // put all nodes in set to visit
-    for (auto& n : nodes)
-      to_visit.insert(n);
-    // visit all inputs first
-    for (auto& i : inputs)
-      if (to_visit.count(i) != 0)
-        _dfs_util(i, &to_visit, handler);
-    // visit any nodes left
-    while (to_visit.size() > 0)
-      _dfs_util(*(to_visit.begin()), &to_visit, handler);
-  }
+  void DFS(std::function<void(Node*)> handler) const;
 
   /* \brief sort graph nodes in topological order */
-  std::vector<Node*> topological_sort() const {
-    std::vector<Node*> sorted;
-    auto handler = [&](Node* n) {
-      sorted.push_back(n);  // when visiting each node, add it in order to the vector
-    };
-    DFS(handler);
-    return sorted;
-  }
+  std::vector<Node*> topological_sort() const;
 
   /* \brief print out graph details */
-  void print(int indent = 0) const {
-    std::string space = "";
-    for (int i = 0; i < indent; i++) space+=" ";
-
-    std::cout << space << "########### Graph #############" << std::endl;
-    std::cout << space << "attributes: " << std::endl;
-    for (auto &kv : attrs)
-      std::cout << space << "\t" << kv.first << " : " << kv.second.str << std::endl;
-    std::cout << space << "inputs: " << inputs.size() << std::endl;
-    std::cout << space << "outputs: " << outputs.size() << std::endl;
-    std::cout << space << "nodes: " << nodes.size() << std::endl;
-    std::vector<Node*> sorted = topological_sort();
-    // loop over each node and print out its inputs/outputs
-    for (int i = static_cast<int>(sorted.size()-1); i >= 0; i--) {
-      std::cout << space << "Node: " << sorted[i]->name << std::endl;
-      for (size_t j = 0; j < sorted[i]->inputs.size(); j++) {
-        std::cout << space << "\tInput: " << sorted[i]->inputs[j].node->name << " "
-                  << sorted[i]->inputs[j].entry << std::endl;
-      }
-      for (size_t j = 0; j < sorted[i]->outputs.size(); j++) {
-        std::cout << space << "\tOutput: " << sorted[i]->outputs[j].node->name << " "
-                  << sorted[i]->outputs[j].entry << std::endl;
-      }
-      if (sorted[i]->subgraphs.size() > 0) {
-        for (auto &subgraph : sorted[i]->subgraphs) {
-          std::cout << space << "\tSubgraph:" << std::endl;
-          subgraph->print(indent+2);
-        }
-      }
-    }
-    std::cout << space << "###############################" << std::endl;
-  }
+  void print(int indent = 0) const;
 
   /* \brief add a new node to this graph */
-  Node* addNode(const std::string& name, const std::string& op) {
-    Node* n = new Node();
-    n->name = name;
-    n->op = op;
-    if (res)
-      n->_setPassResource(res);
-    return n;
-  }
+  Node* addNode(const std::string& name, const std::string& op);
+
   /* \brief get node at index in graph */
-  Node* getNode(size_t idx) {
-    return nodes[idx];
-  }
+  Node* getNode(size_t idx);
+
   /* \brief get const node at index in const graph */
-  const Node* getNode(size_t idx) const {
-    return nodes.at(idx);
-  }
+  const Node* getNode(size_t idx) const;
+
   /* \brief get attribute on graph */
-  const JsonVal& getAttr(const std::string& key) const {
-    return attrs.at(key);
-  }
+  const JsonVal& getAttr(const std::string& key) const;
+
   /* \brief get number of nodes in the graph */
-  size_t size() const {
-    return nodes.size();
-  }
+  size_t size() const;
+
   // internally set passResource to enable tensor allocation for graph passes
-  void _setPassResource(PassResource* res_) {
-    res = res_;
-    // set passResource for each node
-    for (Node* node : nodes) {
-      node->_setPassResource(res);
-    }
-  }
+  void _setPassResource(PassResource* res_);
+
   // internally set arg/aux params when available
   void _setParams(std::unordered_map<std::string, mxnet::ext::MXTensor>* args,
-                  std::unordered_map<std::string, mxnet::ext::MXTensor>* aux) {
-    // set params for each input node
-    for (Node* node : inputs) {
-      if (args->count(node->name) > 0)
-        node->tensor = &args->at(node->name);
-      else if (aux->count(node->name) > 0)
-        node->tensor = &aux->at(node->name);
-    }
-  }
+                  std::unordered_map<std::string, mxnet::ext::MXTensor>* aux);
 
   std::vector<Node*> inputs;
   std::vector<NodeEntry> outputs;
@@ -1213,7 +692,7 @@ class CustomStatefulOp {
   virtual MXReturnValue Backward(std::vector<MXTensor>* inputs,
                                  std::vector<MXTensor>* outputs,
                                  const OpResource& op_res) {
-    std::cout << "Error! Operator does not support backward" << std::endl;
+    MX_ERROR_MSG << "Error! Operator does not support backward" << std::endl;
     return MX_FAIL;
   }
 };
@@ -1229,30 +708,30 @@ class CustomStatefulOpWrapper {
 
 /*! \brief Custom Operator function templates */
 typedef MXReturnValue (*fcomp_t)(const std::unordered_map<std::string,
-                                                          std::string>& attributes,
+                                 std::string>& attributes,
                                  std::vector<MXTensor>* inputs,
                                  std::vector<MXTensor>* outputs,
                                  const OpResource& res);
 typedef MXReturnValue (*parseAttrs_t)(const std::unordered_map<std::string,
-                                                               std::string>& attributes,
+                                      std::string>& attributes,
                                       int* num_inputs, int* num_outputs);
 typedef MXReturnValue (*inferType_t)(const std::unordered_map<std::string,
-                                                               std::string>& attributes,
+                                     std::string>& attributes,
                                      std::vector<int>* in_types,
                                      std::vector<int>* out_types);
 typedef MXReturnValue (*inferSType_t)(const std::unordered_map<std::string,
-                                                               std::string>& attributes,
+                                      std::string>& attributes,
                                       std::vector<int>* in_storage_types,
                                       std::vector<int>* out_storage_types);
 typedef MXReturnValue (*inferShape_t)(const std::unordered_map<std::string,
-                                                               std::string>& attributes,
+                                      std::string>& attributes,
                                       std::vector<std::vector<unsigned int> >* in_shapes,
                                       std::vector<std::vector<unsigned int> >* out_shapes);
 typedef MXReturnValue (*mutateInputs_t)(const std::unordered_map<std::string,
-                                                                 std::string>& attributes,
+                                        std::string>& attributes,
                                         std::vector<int>* input_indices);
 typedef MXReturnValue (*createOpState_t)(const std::unordered_map<std::string,
-                                                                  std::string>& attributes,
+                                         std::string>& attributes,
                                          CustomStatefulOp**);
 
 /*!
@@ -1260,66 +739,27 @@ typedef MXReturnValue (*createOpState_t)(const std::unordered_map<std::string,
  */
 class CustomOp {
  public:
-  explicit CustomOp(const char* op_name) : name(op_name),
-    parse_attrs(NULL), infer_type(NULL), infer_storage_type(NULL), infer_shape(NULL),
-    mutate_inputs(NULL), isSGop(false) {}
-  CustomOp& setForward(fcomp_t fcomp, const char* ctx) {
-    if (forward_ctx_map.count(ctx) > 0)
-      raiseDuplicateContextError();
-    forward_ctx_map[ctx] = fcomp;
-    return *this;
-  }
-  CustomOp& setBackward(fcomp_t fgrad, const char* ctx) {
-    if (backward_ctx_map.count(ctx) > 0)
-      raiseDuplicateContextError();
-    backward_ctx_map[ctx] = fgrad;
-    return *this;
-  }
-  CustomOp& setParseAttrs(parseAttrs_t func) {
-    parse_attrs = func;
-    return *this;
-  }
-  CustomOp& setInferType(inferType_t func) {
-    infer_type = func;
-    return *this;
-  }
-  CustomOp& setInferSType(inferSType_t func) {
-    infer_storage_type = func;
-    return *this;
-  }
-  CustomOp& setInferShape(inferShape_t func) {
-    infer_shape = func;
-    return *this;
-  }
-  CustomOp& setMutateInputs(mutateInputs_t func) {
-    mutate_inputs = func;
-    return *this;
-  }
-  CustomOp& setCreateOpState(createOpState_t func, const char* ctx) {
-    if (create_op_ctx_map.count(ctx) > 0)
-      raiseDuplicateContextError();
-    create_op_ctx_map[ctx] = func;
-    return *this;
-  }
-  CustomOp& setIsSubgraphOp() {
-    isSGop = true;
-    return *this;
-  }
-  void mapToVector() {
-    for (auto kv : forward_ctx_map) {
-      forward_ctx_cstr.push_back(kv.first);
-      forward_fp.push_back(kv.second);
-    }
-    for (auto kv : backward_ctx_map) {
-      backward_ctx_cstr.push_back(kv.first);
-      backward_fp.push_back(kv.second);
-    }
-    for (auto kv : create_op_ctx_map) {
-      create_op_ctx_cstr.push_back(kv.first);
-      create_op_fp.push_back(kv.second);
-    }
-  }
-  ~CustomOp() {}
+  explicit CustomOp(const char* op_name);
+
+  CustomOp& setForward(fcomp_t fcomp, const char* ctx);
+
+  CustomOp& setBackward(fcomp_t fgrad, const char* ctx);
+
+  CustomOp& setParseAttrs(parseAttrs_t func);
+
+  CustomOp& setInferType(inferType_t func);
+
+  CustomOp& setInferSType(inferSType_t func);
+
+  CustomOp& setInferShape(inferShape_t func);
+
+  CustomOp& setMutateInputs(mutateInputs_t func);
+
+  CustomOp& setCreateOpState(createOpState_t func, const char* ctx);
+
+  CustomOp& setIsSubgraphOp();
+
+  void mapToVector();
 
   /*! \brief operator name */
   const char* name;
@@ -1338,12 +778,7 @@ class CustomOp {
   std::vector<createOpState_t> create_op_fp;
 
  private:
-  void raiseDuplicateContextError() {
-    std::string op_name_str(name);
-    throw std::runtime_error(
-      "Error! Error! Cannot register multiple functions under same context for operator '"
-      + op_name_str + "'");
-  }
+  void raiseDuplicateContextError();
 
   /*! \brief dedup context maps - static string ctx to custom function */
   std::unordered_map<const char*, fcomp_t> forward_ctx_map, backward_ctx_map;
@@ -1359,13 +794,10 @@ typedef MXReturnValue (*graphPass_t)(mxnet::ext::Graph* graph,
  */
 class CustomPass {
  public:
-  CustomPass() : name("ERROR") {}
-  explicit CustomPass(const char* pass_name)
-    : name(pass_name) {}
-  CustomPass& setBody(graphPass_t fn) {
-    pass = fn;
-    return *this;
-  }
+  CustomPass();
+  explicit CustomPass(const char* pass_name);
+
+  CustomPass& setBody(graphPass_t fn);
 
   /*! \brief pass name */
   const char* name;
@@ -1391,48 +823,24 @@ typedef MXReturnValue (*reviewSubgraph_t)(const mxnet::ext::Graph *subgraph, int
  */
 class CustomPartitioner {
  public:
-  CustomPartitioner() : name("ERROR") {}
-  explicit CustomPartitioner(const char* backend_name) :
-    name(backend_name) {}
+  CustomPartitioner();
+
+  explicit CustomPartitioner(const char* backend_name);
+
   CustomPartitioner& addStrategy(const char* prop_name,
-                                 const char* sg_name) {
-    strategies.push_back(prop_name);
-    op_names.push_back(sg_name);
-    return *this;
-  }
-  CustomPartitioner& setSupportedOps(const char* prop_name, supportedOps_t fn) {
-    supported_map[std::string(prop_name)] = fn;
-    return *this;
-  }
-  CustomPartitioner& setCreateSelector(const char* prop_name, createSelector_t fn) {
-    selector_map[std::string(prop_name)] = fn;
-    return *this;
-  }
-  CustomPartitioner& setReviewSubgraph(const char* prop_name, reviewSubgraph_t fn) {
-    review_map[std::string(prop_name)] = fn;
-    return *this;
-  }
-  supportedOps_t getSupportedOps(int stg_id) {
-    std::string prop(strategies[stg_id]);
-    if (supported_map.count(prop) > 0)
-      return supported_map[prop];
-    else
-      return nullptr;
-  }
-  createSelector_t getCreateSelector(int stg_id) {
-    std::string prop(strategies[stg_id]);
-    if (selector_map.count(prop) > 0)
-      return selector_map[prop];
-    else
-      return nullptr;
-  }
-  reviewSubgraph_t getReviewSubgraph(int stg_id) {
-    std::string prop(strategies[stg_id]);
-    if (review_map.count(prop) > 0)
-      return review_map[prop];
-    else
-      return nullptr;
-  }
+                                 const char* sg_name);
+
+  CustomPartitioner& setSupportedOps(const char* prop_name, supportedOps_t fn);
+
+  CustomPartitioner& setCreateSelector(const char* prop_name, createSelector_t fn);
+
+  CustomPartitioner& setReviewSubgraph(const char* prop_name, reviewSubgraph_t fn);
+
+  supportedOps_t getSupportedOps(int stg_id);
+
+  createSelector_t getCreateSelector(int stg_id);
+
+  reviewSubgraph_t getReviewSubgraph(int stg_id);
 
   /*! \brief partitioner name */
   const char* name;
@@ -1519,47 +927,6 @@ class Registry {
   MX_STR_CONCAT(MX_REGISTER_PASS_DEF_(Name), __COUNTER__) = \
     Registry<CustomPass>::get()->add(MX_TOSTRING(Name))
 
-/* \brief Class to store error messages from extensions to pass to MXNet */
-class MXerrorMsgs {
- public:
-  /*!
-   * \brief get singleton pointer to class
-   * \returns pointer to class
-   */
-  static MXerrorMsgs* get() {
-    static MXerrorMsgs inst;
-    return &inst;
-  }
-  /*!
-   * \brief add a new error message
-   */
-  std::stringstream& add(const char* file, int line) {
-    messages.push_back(new std::stringstream());
-    *(messages.back()) << file << "[" << line << "]: ";
-    return *(messages.back());
-  }
-  int size() {
-    return messages.size();
-  }
-  const std::string* get(int idx) {
-    return new std::string(messages.at(idx)->str());
-  }
-
- private:
-  /*! \brief constructor */
-  MXerrorMsgs() {}
-  /*! \brief destructor */
-  ~MXerrorMsgs() {
-    for (auto &msg : messages)
-      delete msg;
-  }
-  /*! \brief map of entries in registry */
-  std::vector<std::stringstream*> messages;
-};
-
-// Add a new error message, example: MX_ERROR_MSG << "my error msg";
-#define MX_ERROR_MSG MXerrorMsgs::get()->add(__FILE__, __LINE__)
-
 /* -------------- BELOW ARE CTYPE FUNCTIONS PROTOTYPES --------------- */
 
 /*!
@@ -1758,14 +1125,10 @@ typedef int (*msgGet_t)(int idx, const char** msg);
 
 extern "C" {
   /*! \brief returns MXNet library version */
-  MX_INT_RET _opVersion() {
-    return MX_LIBRARY_VERSION;
-  }
+  MX_INT_RET _opVersion();
 
   /*! \brief returns number of ops registered in this library */
-  MX_INT_RET _opRegSize() {
-    return mxnet::ext::Registry<mxnet::ext::CustomOp>::get()->size();
-  }
+  MX_INT_RET _opRegSize();
 
   /*! \brief returns operator registration at specified index */
   MX_VOID_RET _opRegGet(int idx, const char** name, int *isSGop,
@@ -1775,170 +1138,32 @@ extern "C" {
                         const char*** create_op_ctx, mxnet::ext::createOpState_t** create_op_fp,
                         int* create_op_count, mxnet::ext::parseAttrs_t* parse,
                         mxnet::ext::inferType_t* type, mxnet::ext::inferSType_t* stype,
-                        mxnet::ext::inferShape_t* shape, mxnet::ext::mutateInputs_t* mutate) {
-    mxnet::ext::CustomOp &op = mxnet::ext::Registry<mxnet::ext::CustomOp>::get()->get(idx);
-    *name = op.name;
-    *parse = op.parse_attrs;
-    *type = op.infer_type;
-    *stype = op.infer_storage_type;
-    *shape = op.infer_shape;
-    *mutate = op.mutate_inputs;
-    *isSGop = op.isSGop;
-    op.mapToVector();
-    *forward_ctx = op.forward_ctx_cstr.data();
-    *forward_fp = op.forward_fp.data();
-    *forward_count = op.forward_fp.size();
-    *backward_ctx = op.backward_ctx_cstr.data();
-    *backward_fp = op.backward_fp.data();
-    *backward_count = op.backward_fp.size();
-    *create_op_ctx = op.create_op_ctx_cstr.data();
-    *create_op_fp = op.create_op_fp.data();
-    *create_op_count = op.create_op_fp.size();
-  }
+                        mxnet::ext::inferShape_t* shape, mxnet::ext::mutateInputs_t* mutate);
 
   /*! \brief calls free from the external library for library allocated arrays */
-  MX_VOID_RET _opCallFree(void* ptr) {
-    free(ptr);
-  }
+  MX_VOID_RET _opCallFree(void* ptr);
 
   /*! \brief returns status of calling parse attributes function for operator from library */
   MX_INT_RET _opCallParseAttrs(mxnet::ext::parseAttrs_t parseAttrs, const char* const* keys,
                                const char* const* vals, int num,
-                               int* num_in, int* num_out) {
-    // create map of attributes from list
-    std::unordered_map<std::string, std::string> attrs;
-    for (int i = 0; i < num; i++) {
-      attrs[std::string(keys[i])] = std::string(vals[i]);
-    }
-
-    return parseAttrs(attrs, num_in, num_out);
-  }
+                               int* num_in, int* num_out);
 
   /*! \brief returns status of calling inferShape function for operator from library */
   MX_INT_RET _opCallInferShape(mxnet::ext::inferShape_t inferShape, const char* const* keys,
                                const char* const* vals, int num,
                                unsigned int** inshapes, int* indims, int num_in,
                                unsigned int*** mod_inshapes, int** mod_indims,
-                               unsigned int*** outshapes, int** outdims, int num_out) {
-    // create map of attributes from list
-    std::unordered_map<std::string, std::string> attrs;
-    for (int i = 0; i < num; i++) {
-      attrs[std::string(keys[i])] = std::string(vals[i]);
-    }
-
-    // create a vector of shapes for inputs
-    std::vector<std::vector<unsigned int> > in_shapes(num_in);
-    for (int i = 0; i < num_in; i++) {
-      for (int j = 0; j < indims[i]; j++) {
-        in_shapes[i].push_back(inshapes[i][j]);
-      }
-    }
-
-    // create a vector of shapes for outputs
-    std::vector<std::vector<unsigned int> > out_shapes(num_out);
-
-    int retval = inferShape(attrs, &in_shapes, &out_shapes);
-    if (!retval) return retval;
-
-    // allocate space for modified input dims, shape
-    *mod_indims = static_cast<int*>(malloc (num_in * sizeof(int)));
-    *mod_inshapes = static_cast<unsigned**>(malloc (num_in * sizeof(unsigned*)));
-
-    // copy modified input shapes
-    for (int i = 0; i < num_in; i++) {
-      (*mod_indims)[i] = in_shapes[i].size();
-      (*mod_inshapes)[i] = static_cast<unsigned*>(malloc ((*mod_indims)[i] * sizeof(unsigned)));
-      for (int j = 0; j < (*mod_indims)[i]; j++) {
-        (*mod_inshapes)[i][j] = in_shapes[i][j];
-      }
-    }
-
-    // allocate space for output dims, shape
-    *outdims = static_cast<int*>(malloc (num_out * sizeof(int)));
-    *outshapes = static_cast<unsigned**>(malloc (num_out * sizeof(unsigned*)));
-
-    // copy output shapes
-    for (int i = 0; i < num_out; i++) {
-      (*outdims)[i] = out_shapes[i].size();
-      (*outshapes)[i] = static_cast<unsigned*>(malloc ((*outdims)[i] * sizeof(unsigned)));
-      for (int j = 0; j < (*outdims)[i]; j++) {
-        (*outshapes)[i][j] = out_shapes[i][j];
-      }
-    }
-
-    return retval;
-  }
+                               unsigned int*** outshapes, int** outdims, int num_out);
 
   /*! \brief returns status of calling inferType function for operator from library */
   MX_INT_RET _opCallInferType(mxnet::ext::inferType_t inferType, const char* const* keys,
                               const char* const* vals, int num,
-                              int* intypes, int num_in, int* outtypes, int num_out) {
-    // create map of attributes from list
-    std::unordered_map<std::string, std::string> attrs;
-    for (int i = 0; i < num; i++) {
-      attrs[std::string(keys[i])] = std::string(vals[i]);
-    }
-
-    // create a vector of types for inputs
-    std::vector<int> in_types(num_in);
-    for (int i = 0; i < num_in; i++) {
-      in_types[i] = intypes[i];
-    }
-
-    // create a vector of types for outputs
-    std::vector<int> out_types(num_out, -1);
-
-    int retval = inferType(attrs, &in_types, &out_types);
-    if (!retval)
-      return retval;
-
-    // copy modified input types
-    for (int i = 0; i < num_in; i++) {
-      intypes[i] = in_types[i];
-    }
-    // copy output types
-    for (int i = 0; i < num_out; i++) {
-      outtypes[i] = out_types[i];
-    }
-
-    return retval;
-  }
+                              int* intypes, int num_in, int* outtypes, int num_out);
 
   /*! \brief returns status of calling inferSType function for operator from library */
   MX_INT_RET _opCallInferSType(mxnet::ext::inferSType_t inferSType, const char* const* keys,
                                const char* const* vals, int num,
-                               int* instypes, int num_in, int* outstypes, int num_out) {
-    // create map of attributes from list
-    std::unordered_map<std::string, std::string> attrs;
-    for (int i = 0; i < num; i++) {
-      attrs[std::string(keys[i])] = std::string(vals[i]);
-    }
-
-    // create a vector of types for inputs
-    std::vector<int> in_stypes(num_in);
-    for (int i = 0; i < num_in; i++) {
-      in_stypes[i] = instypes[i];
-    }
-
-    // create a vector of types for outputs
-    std::vector<int> out_stypes(num_out, -1);
-
-    int retval = inferSType(attrs, &in_stypes, &out_stypes);
-
-    if (!retval)
-      return retval;
-
-    // copy modified input storage types
-    for (int i = 0; i < num_in; i++) {
-      instypes[i] = in_stypes[i];
-    }
-    // copy output storage types
-    for (int i = 0; i < num_out; i++) {
-      outstypes[i] = out_stypes[i];
-    }
-
-    return retval;
-  }
+                               int* instypes, int num_in, int* outstypes, int num_out);
 
   /*! \brief returns status of calling Forward/Backward function for operator from library */
   MX_INT_RET _opCallFCompute(mxnet::ext::fcomp_t fcomp, const char* const* keys,
@@ -1956,119 +1181,17 @@ extern "C" {
                              void** in_indptr, void** out_indptr,
                              int64_t* in_indices_shapes, int64_t* out_indices_shapes,
                              int64_t* in_indptr_shapes, int64_t* out_indptr_shapes,
-                             void* rng_cpu_states, void* rng_gpu_states) {
-    // create map of attributes from list
-    std::unordered_map<std::string, std::string> attrs;
-    for (int i = 0; i < num; i++) {
-      attrs[std::string(keys[i])] = std::string(vals[i]);
-    }
-
-    // create a vector of tensors for inputs
-    std::vector<mxnet::ext::MXTensor> inputs(num_in);
-    // create a vector for sparse inputs
-    std::vector<mxnet::ext::MXSparse> in_sparse(num_in);
-
-    for (int i = 0; i < num_in; i++) {
-      // Dense representation.
-      if (instypes[i] == 0) {
-        inputs[i].setTensor(indata[i], (mxnet::ext::MXDType)intypes[i], inshapes[i], indims[i],
-                            inIDs[i], mxnet::ext::MXContext(indev_type[i], indev_id[i]),
-                            mxnet::ext::kDefaultStorage);
-      } else {
-        // Sparse representation.
-        mxnet::ext::MXStorageType type;
-        if (instypes[i] == 1) {
-          type = mxnet::ext::kRowSparseStorage;
-          in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], in_indices_shapes[i]);
-        } else {
-          type = mxnet::ext::kCSRStorage;
-          in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i],
-                           in_indices_shapes[i], in_indptr[i], in_indptr_shapes[i]);
-        }
-        inputs[i].setTensor(reinterpret_cast<void*>(&in_sparse[i]), (mxnet::ext::MXDType)intypes[i],
-                            inshapes[i], indims[i], inIDs[i],
-                            mxnet::ext::MXContext(indev_type[i], indev_id[i]), type);
-      }
-    }
-
-    // create a vector of tensors for outputs
-    std::vector<mxnet::ext::MXTensor> outputs(num_out);
-    std::vector<mxnet::ext::MXSparse> out_sparse(num_out);
-
-    for (int i = 0; i < num_out; i++) {
-      // Dense representation.
-      if (outstypes[i] == 0) {
-        outputs[i].setTensor(outdata[i], (mxnet::ext::MXDType)outtypes[i], outshapes[i], outdims[i],
-                             outIDs[i], mxnet::ext::MXContext(outdev_type[i], outdev_id[i]),
-                             mxnet::ext::kDefaultStorage);
-      } else {
-        // Sparse representation.
-        mxnet::ext::MXStorageType type;
-        if (outstypes[i] == 1) {
-          type = mxnet::ext::kRowSparseStorage;
-          out_sparse[i].set(outdata[i], outshapes[i], outdims[i],
-                            out_indices[i], out_indices_shapes[i]);
-        } else {
-          type = mxnet::ext::kCSRStorage;
-          out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i],
-                            out_indices_shapes[i], out_indptr[i], out_indptr_shapes[i]);
-        }
-        outputs[i].setTensor(reinterpret_cast<void*>(&out_sparse[i]),
-                             (mxnet::ext::MXDType)outtypes[i],
-                             outshapes[i], outdims[i], outIDs[i],
-                             mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), type);
-      }
-    }
-
-    mxnet::ext::OpResource res(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc,
-                               cuda_stream, sparse_malloc, sparse_alloc,
-                               rng_cpu_states, rng_gpu_states);
-    return fcomp(attrs, &inputs, &outputs, res);
-  }
+                             void* rng_cpu_states, void* rng_gpu_states);
 
   /*! \brief returns status of calling mutateInputs function for operator from library */
   MX_INT_RET _opCallMutateInputs(mxnet::ext::mutateInputs_t mutate, const char* const* keys,
                                  const char* const* vals, int num,
-                                 int** mutate_indices, int* indices_size) {
-    // create map of attributes from list
-    std::unordered_map<std::string, std::string> attrs;
-    for (int i = 0; i < num; i++) {
-      attrs[std::string(keys[i])] = std::string(vals[i]);
-    }
-
-    // create a vector of mutate input indices
-    std::vector<int> mut_ind;
-
-    int retval = mutate(attrs, &mut_ind);
-    if (!retval)
-      return retval;
-
-    // output the input indices
-    *indices_size = mut_ind.size();
-    *mutate_indices = static_cast<int*>(malloc (*indices_size * sizeof(int)));
-    for (int i = 0; i < *indices_size; i++) {
-      (*mutate_indices)[i] = mut_ind[i];
-    }
-
-    return retval;
-  }
+                                 int** mutate_indices, int* indices_size);
 
   /*! \brief returns status of calling createStatefulOp function for operator from library */
   MX_INT_RET _opCallCreateOpState(mxnet::ext::createOpState_t create_op, const char* const* keys,
                                   const char* const* vals, int num,
-                                  void** state_op) {
-    // create map of attributes from list
-    std::unordered_map<std::string, std::string> attrs;
-    for (int i = 0; i < num; i++) {
-      attrs[std::string(keys[i])] = std::string(vals[i]);
-    }
-
-    // void pointer to hold custom state op instance created in custom library
-    // eventually state_op pointer is populated by instance from custom library
-    mxnet::ext::CustomStatefulOp** op_ptr =
-      reinterpret_cast<mxnet::ext::CustomStatefulOp**>(state_op);
-    return create_op(attrs, op_ptr);
-  }
+                                  void** state_op);
 
   /*! \brief returns status of calling Stateful Forward/Backward for operator from library */
   MX_INT_RET _opCallFStatefulCompute(int is_forward, void* state_op, const int64_t** inshapes,
@@ -2086,194 +1209,48 @@ extern "C" {
                                      void** out_indptr, int64_t* in_indices_shapes,
                                      int64_t* out_indices_shapes, int64_t* in_indptr_shapes,
                                      int64_t* out_indptr_shapes,
-                                     void* rng_cpu_states, void* rng_gpu_states) {
-    // create a vector of tensors for inputs
-    std::vector<mxnet::ext::MXTensor> inputs(num_in);
-    // create a vector for sparse inputs
-    std::vector<mxnet::ext::MXSparse> in_sparse(num_in);
-
-    for (int i = 0; i < num_in; i++) {
-      if (instypes[i] == 0) {
-        // Dense representation.
-        inputs[i].setTensor(indata[i], (mxnet::ext::MXDType)intypes[i], inshapes[i], indims[i],
-                            inIDs[i], mxnet::ext::MXContext(indev_type[i], indev_id[i]),
-                            mxnet::ext::kDefaultStorage);
-      } else {
-        // Sparse representation.
-        mxnet::ext::MXStorageType type;
-        if (instypes[i] == 1) {
-          type = mxnet::ext::kRowSparseStorage;
-          in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], in_indices_shapes[i]);
-        } else {
-          type = mxnet::ext::kCSRStorage;
-          in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i],
-                           in_indices_shapes[i], in_indptr[i], in_indptr_shapes[i]);
-        }
-        inputs[i].setTensor(reinterpret_cast<void*>(&in_sparse[i]), (mxnet::ext::MXDType)intypes[i],
-                            inshapes[i], indims[i], inIDs[i],
-                            mxnet::ext::MXContext(indev_type[i], indev_id[i]), type);
-      }
-    }
-
-    // create a vector of tensors for outputs
-    std::vector<mxnet::ext::MXTensor> outputs(num_out);
-    // create a vector for sparse outputs
-    std::vector<mxnet::ext::MXSparse> out_sparse(num_out);
-
-    for (int i = 0; i < num_out; i++) {
-      if (outstypes[i] == 0) {
-        // Dense representation.
-        outputs[i].setTensor(outdata[i], (mxnet::ext::MXDType)outtypes[i], outshapes[i], outdims[i],
-                             outIDs[i], mxnet::ext::MXContext(outdev_type[i], outdev_id[i]),
-                             mxnet::ext::kDefaultStorage);
-      } else {
-        // Sparse representation.
-        mxnet::ext::MXStorageType type;
-        if (outstypes[i] == 1) {
-          type = mxnet::ext::kRowSparseStorage;
-          out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i],
-                            out_indices_shapes[i]);
-        } else {
-          type = mxnet::ext::kCSRStorage;
-          out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i],
-                            out_indices_shapes[i], out_indptr[i], out_indptr_shapes[i]);
-        }
-        outputs[i].setTensor(reinterpret_cast<void*>(&out_sparse[i]),
-                             (mxnet::ext::MXDType)outtypes[i],
-                             outshapes[i], outdims[i], outIDs[i],
-                             mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), type);
-      }
-    }
-
-    mxnet::ext::OpResource res(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc,
-                               stream, sparse_malloc, sparse_alloc, rng_cpu_states, rng_gpu_states);
-
-    mxnet::ext::CustomStatefulOp* op_ptr =
-      reinterpret_cast<mxnet::ext::CustomStatefulOp*>(state_op);
-    if (is_forward) {
-      return op_ptr->Forward(&inputs, &outputs, res);
-    }
-    return op_ptr->Backward(&inputs, &outputs, res);
-  }
+                                     void* rng_cpu_states, void* rng_gpu_states);
 
   /*! \brief returns number of partitioners registered in this library */
-  MX_INT_RET _partRegSize() {
-    return mxnet::ext::Registry<mxnet::ext::CustomPartitioner>::get()->size();
-  }
+  MX_INT_RET _partRegSize();
 
   /* returns number of strategies registered for partitioner
    * at specified index */
-  MX_INT_RET _partRegGetCount(int idx, const char** name) {
-    mxnet::ext::CustomPartitioner part =
-      mxnet::ext::Registry<mxnet::ext::CustomPartitioner>::get()->get(idx);
-    *name = part.name;
-    return part.strategies.size();
-  }
+  MX_INT_RET _partRegGetCount(int idx, const char** name);
 
   /*! \brief returns partitioner registration at specified index */
   MX_VOID_RET _partRegGet(int part_idx, int stg_idx, const char** strategy,
                           mxnet::ext::supportedOps_t* supportedOps,
                           mxnet::ext::createSelector_t* createSelector,
-                          mxnet::ext::reviewSubgraph_t* reviewSubgraph, const char** op_name) {
-    mxnet::ext::CustomPartitioner part =
-      mxnet::ext::Registry<mxnet::ext::CustomPartitioner>::get()->get(part_idx);
-    *strategy = part.strategies[stg_idx];
-    *op_name = part.op_names[stg_idx];
-    *supportedOps = part.getSupportedOps(stg_idx);
-    *createSelector = part.getCreateSelector(stg_idx);
-    *reviewSubgraph = part.getReviewSubgraph(stg_idx);
-  }
+                          mxnet::ext::reviewSubgraph_t* reviewSubgraph, const char** op_name);
 
   /*! \brief returns status of calling supported ops function from library */
   MX_INT_RET _partCallSupportedOps(mxnet::ext::supportedOps_t supportedOps, const char *json,
                                    int num_ids, int *ids, const char* const* opt_keys,
-                                   const char* const* opt_vals, int num_opts) {
-    mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json);
-    // create map of options from list
-    std::unordered_map<std::string, std::string> opts;
-    for (int i = 0; i < num_opts; i++)
-      opts[std::string(opt_keys[i])] = std::string(opt_vals[i]);
-
-    // create array of subgraph IDs for operator support
-    std::vector<int> _ids(num_ids, -2);
-    // call user's supportedOps function
-    mxnet::ext::MXReturnValue retval = supportedOps(graph, &_ids, opts);
-    if (!retval) return retval;
-
-    // copy bools in ids to ints
-    for (int i = 0; i < num_ids; i++)
-      ids[i] = _ids[i];
-
-    return retval;
-  }
+                                   const char* const* opt_vals, int num_opts);
 
   /*! \brief returns status of calling create selector function from library */
   MX_INT_RET _partCallCreateSelector(mxnet::ext::createSelector_t createSelector, const char *json,
                                      void** selector, const char* const* opt_keys,
-                                     const char* const* opt_vals, int num_opts) {
-    mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json);
-    // create map of options from list
-    std::unordered_map<std::string, std::string> opts;
-    for (int i = 0; i < num_opts; i++)
-      opts[std::string(opt_keys[i])] = std::string(opt_vals[i]);
-
-    // void pointer to hold selector instance created in custom library
-    // eventually pointer is populated by instance from custom library
-    mxnet::ext::CustomOpSelector** sel_ptr =
-      reinterpret_cast<mxnet::ext::CustomOpSelector**>(selector);
-
-    // call user's createSelector function
-    return createSelector(graph, sel_ptr, opts);
-  }
+                                     const char* const* opt_vals, int num_opts);
 
   /*! \brief returns status of calling select function from library */
-  MX_VOID_RET _partCallSelect(void* sel_inst, int nodeID, int* selected) {
-    mxnet::ext::CustomOpSelector* sel_ptr =
-      reinterpret_cast<mxnet::ext::CustomOpSelector*>(sel_inst);
-    *selected = sel_ptr->Select(nodeID);
-  }
+  MX_VOID_RET _partCallSelect(void* sel_inst, int nodeID, int* selected);
 
   /*! \brief returns status of calling select input function from library */
   MX_VOID_RET _partCallSelectInput(void* sel_inst, int nodeID,
-                                  int input_nodeID, int* selected) {
-    mxnet::ext::CustomOpSelector* sel_ptr =
-      reinterpret_cast<mxnet::ext::CustomOpSelector*>(sel_inst);
-    *selected = sel_ptr->SelectInput(nodeID, input_nodeID);
-  }
+                                   int input_nodeID, int* selected);
 
   /*! \brief returns status of calling select output function from library */
   MX_VOID_RET _partCallSelectOutput(void* sel_inst, int nodeID,
-                                    int output_nodeID, int* selected) {
-    mxnet::ext::CustomOpSelector* sel_ptr =
-      reinterpret_cast<mxnet::ext::CustomOpSelector*>(sel_inst);
-    *selected = sel_ptr->SelectOutput(nodeID, output_nodeID);
-  }
+                                    int output_nodeID, int* selected);
 
   /*! \brief returns status of calling filter function from library */
   MX_VOID_RET _partCallFilter(void* sel_inst, int* candidates, int num_candidates,
-                              int** keep, int* num_keep) {
-    mxnet::ext::CustomOpSelector* sel_ptr =
-      reinterpret_cast<mxnet::ext::CustomOpSelector*>(sel_inst);
-    std::vector<int> candidates_(num_candidates);
-    for (int i=0; i < num_candidates; i++) {
-      candidates_[i] = candidates[i];
-    }
-    std::vector<int> keep_;
-
-    sel_ptr->Filter(candidates_, &keep_);
-
-    *num_keep = keep_.size();
-    *keep = static_cast<int*>(malloc(keep_.size() * sizeof(int)));
-    for (unsigned i=0; i < keep_.size(); i++)
-      (*keep)[i] = keep_[i];
-  }
+                              int** keep, int* num_keep);
 
   /*! \brief returns status of calling reset selector function from library */
-  MX_VOID_RET _partCallReset(void* sel_inst) {
-    mxnet::ext::CustomOpSelector* sel_ptr =
-      reinterpret_cast<mxnet::ext::CustomOpSelector*>(sel_inst);
-    sel_ptr->Reset();
-  }
+  MX_VOID_RET _partCallReset(void* sel_inst);
 
   /*! \brief returns status of calling review subgraph function from library */
   MX_INT_RET _partCallReviewSubgraph(mxnet::ext::reviewSubgraph_t reviewSubgraph, const char *json,
@@ -2289,79 +1266,14 @@ extern "C" {
                                      void* const* aux_data, const int64_t* const* aux_shapes,
                                      const int* aux_dims, const int* aux_types,
                                      const size_t* aux_IDs, const char* const* aux_dev_type,
-                                     const int* aux_dev_id) {
-    mxnet::ext::Graph *subgraph = mxnet::ext::Graph::fromString(json);
-    bool accept_bool = false;
-    // create map of attributes from list
-    std::unordered_map<std::string, std::string> opts;
-    for (int i = 0; i < num_opts; i++)
-      opts[std::string(opt_keys[i])] = std::string(opt_vals[i]);
-
-    // create a map of named tensors for args
-    std::unordered_map<std::string, mxnet::ext::MXTensor> args;
-    for (int i = 0; i < num_args; i++) {
-      std::vector<int64_t> shapes;
-      for (int j = 0; j < arg_dims[i]; j++)
-        shapes.push_back(arg_shapes[i][j]);
-
-      mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i],
-                      arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i], arg_dev_id[i]));
-      args[arg_names[i]] = tensor;
-    }
-    // create a map of named tensors for aux
-    std::unordered_map<std::string, mxnet::ext::MXTensor> aux;
-    for (int i = 0; i < num_aux; i++) {
-      std::vector<int64_t> shapes;
-      for (int j = 0; j < aux_dims[i]; j++)
-        shapes.push_back(aux_shapes[i][j]);
-
-      mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i],
-                                  aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i],
-                                                                    aux_dev_id[i]));
-      aux[aux_names[i]] = tensor;
-    }
-
-    subgraph->_setParams(&args, &aux);
-    mxnet::ext::MXReturnValue retval = reviewSubgraph(subgraph, subgraph_id, &accept_bool,
-                                                      opts);
-    if (!retval) return retval;
-
-    *accept = accept_bool;
-
-    if (subgraph->attrs.size() > 0) {
-      *num_attrs = subgraph->attrs.size();
-      // allocate space for attributes
-      *attr_keys = static_cast<char**>(malloc (*num_attrs * sizeof(char*)));
-      *attr_vals = static_cast<char**>(malloc (*num_attrs * sizeof(char*)));
-
-      // copy attributes
-      int i = 0;
-      for (auto kv : subgraph->attrs) {
-        (*attr_keys)[i] = static_cast<char*>(malloc ((kv.first.size()+1) * sizeof(char)));
-        std::string val = kv.second.dump();  // convert JsonVal back to string
-        (*attr_vals)[i] = static_cast<char*>(malloc ((val.size()+1) * sizeof(char)));
-        snprintf((*attr_keys)[i], kv.first.size()+1, "%s", kv.first.c_str());
-        snprintf((*attr_vals)[i], val.size()+1, "%s", val.c_str());
-        i++;
-      }
-    }
-
-    return retval;
-  }
+                                     const int* aux_dev_id);
 
   /*! \brief returns number of graph passes registered in this library */
-  MX_INT_RET _passRegSize() {
-    return mxnet::ext::Registry<mxnet::ext::CustomPass>::get()->size();
-  }
+  MX_INT_RET _passRegSize();
 
   /*! \brief returns pass registration at specified index */
   MX_VOID_RET _passRegGet(int pass_idx, mxnet::ext::graphPass_t* graphPass,
-                          const char** pass_name) {
-    mxnet::ext::CustomPass pass =
-      mxnet::ext::Registry<mxnet::ext::CustomPass>::get()->get(pass_idx);
-    *graphPass = pass.pass;
-    *pass_name = pass.name;
-  }
+                          const char** pass_name);
 
   /*! \brief returns status of calling graph pass function from library */
   MX_INT_RET _passCallGraphPass(mxnet::ext::graphPass_t graphPass, const char *json,
@@ -2376,49 +1288,7 @@ extern "C" {
                                 const int* aux_dims, const int* aux_types,
                                 const size_t* aux_IDs, const char* const* aux_dev_type,
                                 const int* aux_dev_id, mxnet::ext::nd_malloc_t nd_malloc,
-                                const void* nd_alloc) {
-    mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json);
-    // create map of attributes from list
-    std::unordered_map<std::string, std::string> opts;
-    for (int i = 0; i < num_opts; i++)
-      opts[std::string(opt_keys[i])] = std::string(opt_vals[i]);
-
-    // create a map of named tensors for args
-    std::unordered_map<std::string, mxnet::ext::MXTensor> args;
-    for (int i = 0; i < num_args; i++) {
-      std::vector<int64_t> shapes;
-      for (int j = 0; j < arg_dims[i]; j++)
-        shapes.push_back(arg_shapes[i][j]);
-
-      mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i],
-                                  arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i],
-                                                                    arg_dev_id[i]));
-      args[arg_names[i]] = tensor;
-    }
-    // create a map of named tensors for aux
-    std::unordered_map<std::string, mxnet::ext::MXTensor> aux;
-    for (int i = 0; i < num_aux; i++) {
-      std::vector<int64_t> shapes;
-      for (int j = 0; j < aux_dims[i]; j++)
-        shapes.push_back(aux_shapes[i][j]);
-
-      mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i],
-                                  aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i],
-                                                                    aux_dev_id[i]));
-      aux[aux_names[i]] = tensor;
-    }
-
-    std::unordered_map<std::string, mxnet::ext::MXTensor> new_args, new_aux;
-    mxnet::ext::PassResource res(&new_args, &new_aux, nd_malloc, nd_alloc);
-    graph->_setParams(&args, &aux);
-    graph->_setPassResource(&res);
-    mxnet::ext::MXReturnValue retval = graphPass(graph, opts);
-    if (!retval) return retval;
-
-    std::string *tmp = new std::string(graph->toString());
-    *out_graph = const_cast<char*>(tmp->c_str());
-    return retval;
-  }
+                                const void* nd_alloc);
 
   /*!
    * \brief Checks if the MXNet version is supported by the library.
@@ -2434,13 +1304,10 @@ extern "C" {
 #endif
   initialize(int version);
 
-  MX_INT_RET _msgSize() {
-    return mxnet::ext::MXerrorMsgs::get()->size();
-  }
+  MX_INT_RET _msgSize();
 
   /*! \brief returns operator registration at specified index */
-  MX_VOID_RET _msgGet(int idx, const char** msg) {
-    *msg = mxnet::ext::MXerrorMsgs::get()->get(idx)->c_str();
-  }
+  MX_VOID_RET _msgGet(int idx, const char** msg);
 }  // extern "C"
+
 #endif  // MXNET_LIB_API_H_
diff --git a/src/lib_api.cc b/src/lib_api.cc
new file mode 100644
index 000000000000..f46c36a32d64
--- /dev/null
+++ b/src/lib_api.cc
@@ -0,0 +1,1593 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file lib_api.cc
+ * \brief APIs to interact with libraries
+ * This API specifies function prototypes to
+ * register custom ops, partitioner, and passes
+ * for library authors
+ * See example/extensions/lib_custom_op/README.md
+ * See example/extensions/lib_subgraph/README.md
+ * See example/extensions/lib_pass/README.md
+ */
+
+#include <mxnet/lib_api.h>
+
+mxnet::ext::MXerrorMsgs* mxnet::ext::MXerrorMsgs::get() {
+  static MXerrorMsgs inst;  // function-local static: built on first call, shared afterwards
+  return &inst;
+}
+
+std::stringstream& mxnet::ext::MXerrorMsgs::add(const char* file, int line) {
+  messages.emplace_back();  // open a fresh, empty stream for the new message
+  messages.back() << file << "[" << line << "]: ";  // prefix it with "<file>[<line>]: "
+  return messages.back();  // handed back by reference so more text can be streamed into it
+}
+
+int mxnet::ext::MXerrorMsgs::size() {
+  return static_cast<int>(messages.size());  // explicit narrowing from the container's size type
+}
+
+const std::string* mxnet::ext::MXerrorMsgs::get(int idx) {
+  return new std::string(messages.at(idx).str());  // NOTE(review): heap copy, no visible owner
+}
+
+mxnet::ext::MXContext::MXContext() : dev_type("error"), dev_id(-1) {}  // placeholder context
+
+mxnet::ext::MXContext::MXContext(std::string dev_type_, int dev_id_)
+  : dev_type(std::move(dev_type_)), dev_id(dev_id_) {}  // the string argument is moved in
+
+mxnet::ext::MXContext::MXContext(const char* dev_type_, int dev_id_)
+  : dev_type(dev_type_), dev_id(dev_id_) {}  // the C string is copied into dev_type
+
+mxnet::ext::MXContext mxnet::ext::MXContext::CPU() { return CPU(0); }  // device id 0
+
+mxnet::ext::MXContext mxnet::ext::MXContext::GPU() { return GPU(0); }  // device id 0
+
+mxnet::ext::MXContext mxnet::ext::MXContext::CPU(int dev_id) { return MXContext("cpu", dev_id); }
+
+mxnet::ext::MXContext mxnet::ext::MXContext::GPU(int dev_id) { return MXContext("gpu", dev_id); }
+
+void mxnet::ext::MXSparse::set(void *data_ptr, const int64_t* dims, int ndims, void *idx,
+                               int64_t num_idx, void *idx_ptr, int64_t num_idx_ptr) {
+  data = data_ptr;
+  // CSR (idx_ptr given): the number of non-zero elements is num_idx.
+  // Row sparse (no idx_ptr): the number of elements is num_idx * the product of dims[1..].
+  data_len = num_idx;
+  if (!idx_ptr) {
+    for (int i = 1; i < ndims; ++i)
+      data_len *= dims[i];  // scale the count by each dimension after the first
+  }
+
+  indices = reinterpret_cast<int64_t*>(idx);  // the index buffer is viewed as int64_t values
+  indices_len = num_idx;
+
+  if (idx_ptr) {  // indptr fields are written only when an index-pointer buffer is supplied
+    indptr = reinterpret_cast<int64_t*>(idx_ptr);
+    indptr_len = num_idx_ptr;
+  }
+}
+
+mxnet::ext::MXTensor::MXTensor() : data_ptr(nullptr), dtype(kUNSET), verID(0),
+                                   stype(kDefaultStorage) {}  // dltensor is not populated here
+mxnet::ext::MXTensor::MXTensor(const MXTensor& oth) : data_ptr(oth.data_ptr), shape(oth.shape),
+                                                      dtype(oth.dtype), verID(oth.verID),
+                                                      ctx(oth.ctx), stype(oth.stype) {
+  setDLTensor();  // rebuilt, not copied, so dltensor.shape points into this copy's shape
+}
+
+mxnet::ext::MXTensor::MXTensor(void *data_ptr, std::vector<int64_t> shape, MXDType dtype,
+                               size_t vID, MXContext mx_ctx, MXStorageType stype)
+  : data_ptr(data_ptr), shape(std::move(shape)), dtype(dtype), verID(vID), ctx(std::move(mx_ctx)),
+    stype(stype) {
+  setDLTensor();  // derive the DLTensor view from the members initialized above
+}
+
+void mxnet::ext::MXTensor::setTensor(void *dptr, MXDType type, const int64_t* dims, int ndims,
+                                     size_t vID, MXContext mx_ctx, MXStorageType storage_type) {
+  data_ptr = dptr; dtype = type; verID = vID; stype = storage_type;
+  ctx = std::move(mx_ctx);  // by-value parameter, not used again below
+  shape.clear();  // drop the previous dimensions before copying the new ones
+  for (int d = 0; d < ndims; ++d)
+    shape.push_back(dims[d]);
+  setDLTensor();  // refresh the DLTensor view to match the updated members
+}
+
+void mxnet::ext::MXTensor::setDLTensor() {
+  dltensor.data = data_ptr;
+  dltensor.ndim = shape.size();
+  dltensor.shape = const_cast<int64_t*>(shape.data());  // borrows this tensor's own shape buffer
+  dltensor.strides = nullptr;
+  dltensor.byte_offset = 0;
+  dltensor.dtype.lanes = 1;
+  dltensor.ctx.device_id = ctx.dev_id;
+  if (ctx.dev_type == "cpu")  // translate the device-type string into the DLTensor device enum
+    dltensor.ctx.device_type = kDLCPU;
+  else if (ctx.dev_type == "gpu")
+    dltensor.ctx.device_type = kDLGPU;
+  else if (ctx.dev_type == "opencl")
+    dltensor.ctx.device_type = kDLOpenCL;
+  else if (ctx.dev_type == "vulkan" || ctx.dev_type == "vulcan")  // "vulcan": legacy misspelling
+    dltensor.ctx.device_type = kDLVulkan;
+  else if (ctx.dev_type == "metal")
+    dltensor.ctx.device_type = kDLMetal;
+  else if (ctx.dev_type == "vpi")
+    dltensor.ctx.device_type = kDLVPI;
+  else if (ctx.dev_type == "rocm")
+    dltensor.ctx.device_type = kDLROCM;
+  else
+    dltensor.ctx.device_type = kDLExtDev;  // any device string not matched above
+  switch (dtype) {  // translate the MXDType flag into a DLTensor type code and bit width
+  case kFloat32:
+    dltensor.dtype.code = kDLFloat;
+    dltensor.dtype.bits = 32;
+    break;
+  case kFloat64:
+    dltensor.dtype.code = kDLFloat;
+    dltensor.dtype.bits = 64;
+    break;
+  case kFloat16:
+    dltensor.dtype.code = kDLFloat;
+    dltensor.dtype.bits = 16;
+    break;
+  case kUint8:
+    dltensor.dtype.code = kDLUInt;
+    dltensor.dtype.bits = 8;
+    break;
+  case kInt32:
+    dltensor.dtype.code = kDLInt;
+    dltensor.dtype.bits = 32;
+    break;
+  case kInt8:
+    dltensor.dtype.code = kDLInt;
+    dltensor.dtype.bits = 8;
+    break;
+  case kInt64:
+    dltensor.dtype.code = kDLInt;
+    dltensor.dtype.bits = 64;
+    break;
+  default:  // unknown flag: zero the type fields, then report the flag value to the caller
+    dltensor.dtype.code = 0;
+    dltensor.dtype.bits = 0;
+    throw std::runtime_error("Error! Invalid dtype flag: "
+                             + std::to_string(static_cast<int>(dtype))
+                             + " when constructing MXTensor");
+  }
+}
+
+int64_t mxnet::ext::MXTensor::size() const {
+  int64_t count = 1;  // an empty shape therefore yields 1
+  for (const int64_t extent : shape)
+    count *= extent;  // element count is the product of every dimension extent
+  return count;
+}
+
+bool mxnet::ext::MXTensor::isSame(const MXTensor &oth) const {
+  // sameness is member-by-member equality of the seven fields below;
+  // the dltensor member is not consulted
+  const bool same_buffer = data_ptr == oth.data_ptr && verID == oth.verID;
+  const bool same_device = ctx.dev_type == oth.ctx.dev_type && ctx.dev_id == oth.ctx.dev_id;
+  const bool same_layout = dtype == oth.dtype && shape == oth.shape && stype == oth.stype;
+  // every comparison is side-effect free, so the grouping does not affect the result
+  return same_buffer && same_device && same_layout;
+}
+
+mxnet::ext::PassResource::PassResource(std::unordered_map<std::string, MXTensor>* new_args,
+                                       std::unordered_map<std::string, MXTensor>* new_aux,
+                                       nd_malloc_t nd_malloc, const void* nd_alloc)
+  : new_args_(new_args), new_aux_(new_aux), nd_malloc_(nd_malloc), nd_alloc_(nd_alloc) {}
+
+mxnet::ext::MXTensor* mxnet::ext::PassResource::alloc_arg(const std::string& name,
+                                                          const std::vector<int64_t>& shapes,
+                                                          const mxnet::ext::MXContext &ctx,
+                                                          mxnet::ext::MXDType dtype) const {
+  void* data = nullptr;  // stays null if the callback never writes the out-parameter
+  nd_malloc_(nd_alloc_, shapes.data(), shapes.size(), ctx.dev_type.c_str(), ctx.dev_id,
+             dtype, name.c_str(), 1, &data);
+  MXTensor& slot = (*new_args_)[name];  // filled in place: dltensor must use slot's own shape
+  slot.setTensor(data, dtype, shapes.data(), shapes.size(), 0, ctx, kDefaultStorage);
+  return &slot;
+}
+
+mxnet::ext::MXTensor* mxnet::ext::PassResource::alloc_aux(const std::string& name,
+                                                          const std::vector<int64_t>& shapes,
+                                                          const mxnet::ext::MXContext &ctx,
+                                                          mxnet::ext::MXDType dtype) const {
+  void* data = nullptr;  // stays null if the callback never writes the out-parameter
+  nd_malloc_(nd_alloc_, shapes.data(), shapes.size(), ctx.dev_type.c_str(), ctx.dev_id,
+             dtype, name.c_str(), 0, &data);
+  MXTensor& slot = (*new_aux_)[name];  // filled in place: dltensor must use slot's own shape
+  slot.setTensor(data, dtype, shapes.data(), shapes.size(), 0, ctx, kDefaultStorage);
+  return &slot;
+}
+
+mxnet::ext::OpResource::OpResource(xpu_malloc_t cpu_malloc_fp, void* cpu_alloc_fp,
+                                   xpu_malloc_t gpu_malloc_fp, void* gpu_alloc_fp, void* stream,
+                                   sparse_malloc_t sparse_malloc_fp, void* sparse_alloc_fp,
+                                   void* rng_cpu_states, void* rng_gpu_states)
+  : cpu_malloc(cpu_malloc_fp), gpu_malloc(gpu_malloc_fp),  // allocation callbacks
+    cpu_alloc(cpu_alloc_fp), gpu_alloc(gpu_alloc_fp), cuda_stream(stream),
+    sparse_malloc(sparse_malloc_fp), sparse_alloc(sparse_alloc_fp),
+    rand_cpu_states(rng_cpu_states), rand_gpu_states(rng_gpu_states) {}  // stored as given
+
+void* mxnet::ext::OpResource::alloc_cpu(int size) const {
+  return cpu_malloc(cpu_alloc, size);  // forwards to the stored callback with its handle
+}
+
+void* mxnet::ext::OpResource::alloc_gpu(int size) const {
+  return gpu_malloc(gpu_alloc, size);  // forwards to the stored callback with its handle
+}
+
+void mxnet::ext::OpResource::alloc_sparse(mxnet::ext::MXSparse* sparse, int index,
+                                          int indices_len, int indptr_len) const {
+  sparse_malloc(sparse_alloc, index, indices_len, indptr_len,
+                &(sparse->data), &(sparse->indices), &(sparse->indptr));  // callback outputs
+}
+
+mxnet::ext::mx_cpu_rand_t* mxnet::ext::OpResource::get_cpu_rand_states() const {
+  return static_cast<mx_cpu_rand_t*>(rand_cpu_states);  // typed view of the stored void*
+}
+
+std::string mxnet::ext::getShapeAt(const std::string& shape, unsigned index) {
+  int first = 1;  // entry 0 is taken to begin at position 1, just past the outer '['
+  // advance to the next '[' once for every entry that precedes the requested one
+  for (unsigned skipped = 0; skipped != index; ++skipped)
+    first = shape.find("[", first + 1);
+  // the entry ends at the first ']' found at or after its opening bracket
+  const int last = shape.find("]", first);
+  return shape.substr(first, last - first + 1);  // keep both enclosing brackets
+}
+
+std::string mxnet::ext::getDtypeAt(const std::string& dtype, unsigned index) {
+  size_t idx = 0;  // position of the separator (or the opening '[') before the element
+  for (unsigned x = 0; x < index && idx != std::string::npos; x++)
+    idx = dtype.find(",", idx+1);
+  if (idx == std::string::npos || dtype.empty()) return "";  // fewer than index+1 elements
+  size_t stop = dtype.find(",", idx+1);  // find stop index for this output dtype
+  if (stop == std::string::npos) stop = dtype.find("]", idx+1);  // last element ends at ']'
+  return dtype.substr(idx+1, stop-idx-1);
+}
+
+mxnet::ext::JsonVal::JsonVal() : type(ERR), num(-1), str("") {}  // default value is type ERR
+mxnet::ext::JsonVal::JsonVal(mxnet::ext::JsonType t) : type(t), num(-1), str("") {}
+mxnet::ext::JsonVal::JsonVal(std::string s) : type(STR), num(-1), str(std::move(s)) {}
+mxnet::ext::JsonVal::JsonVal(int n) : type(NUM), num(n), str(std::to_string(n)) {}
+mxnet::ext::JsonVal::JsonVal(JsonType t, int n, std::string s) : type(t), num(n),  // explicit
+                                                                 str(std::move(s)) {}
+
+bool mxnet::ext::JsonVal::operator<(const mxnet::ext::JsonVal &o) const {
+  // Strict weak ordering (required for use as a std::map key): values of different types
+  // order by type; values of the same type order by payload. Equal values are never "less".
+  if (type != o.type) return type < o.type;
+  switch (type) {
+  case STR: return str < o.str;
+  case NUM: return num < o.num;
+  case LIST:  // shorter list first, then the first differing element decides
+    if (list.size() != o.list.size()) return list.size() < o.list.size();
+    for (size_t i = 0; i < list.size(); i++) {
+      if (list[i] < o.list[i]) return true;
+      if (o.list[i] < list[i]) return false;
+    }
+    return false;  // identical lists
+  case MAP: {  // smaller map first, then the first differing key or value (in key order)
+    if (map.size() != o.map.size()) return map.size() < o.map.size();
+    for (auto a = map.begin(), b = o.map.begin(); a != map.end(); ++a, ++b) {
+      if (a->first < b->first || b->first < a->first) return a->first < b->first;
+      if (a->second < b->second || b->second < a->second) return a->second < b->second;
+    }
+    return false;  // identical maps
+  }
+  default: return false;  // ERR (or unknown) values of the same type compare equal
+  }
+}
+
+std::string mxnet::ext::JsonVal::dump() const {
+  // Serializes this value to JSON text; lists and maps recurse into their children.
+  // Scalar kinds return immediately, containers are assembled piece by piece below.
+  if (type == ERR)
+    return "json(Error)";
+  if (type == STR)
+    return "\"" + str + "\"";  // the text is wrapped in quotes as-is (no escaping is applied)
+  if (type == NUM)
+    return str;  // numbers are emitted from their stored text form
+
+  std::string out;
+  if (type == LIST) {
+    // elements in order, comma between neighbours but not after the last one
+    out = "[";
+    const size_t count = list.size();
+    for (size_t pos = 0; pos < count; ++pos) {
+      out += list[pos].dump();
+      if (pos + 1 < count)
+        out += ",";
+    }
+    out += "]";
+  } else if (type == MAP) {
+    // "key : value" pairs in the map's iteration order, comma-separated
+    out = "{";
+    size_t remaining = map.size();
+    for (const auto &entry : map) {
+      out += entry.first.dump() + " : " + entry.second.dump();
+      if (--remaining > 0)
+        out += ",";
+    }
+    out += "}";
+  }
+  // any other type value yields an empty string
+  return out;
+}
+
+mxnet::ext::JsonVal mxnet::ext::JsonVal::parse(const std::string& json) {
+  unsigned int cursor = 0;  // parsing starts at the first character of the text
+  return JsonVal::parse(json, &cursor);
+}
+
+mxnet::ext::JsonVal mxnet::ext::JsonVal::parse_string(const std::string& json, unsigned int* idx) {
+  // *idx starts just past the opening quote; on success it ends just past the closing quote
+  const unsigned int start = *idx;  // kept so the error message can show the unterminated text
+  JsonVal ret(STR);
+  while (*idx < json.size()) {
+    if (json[*idx] == '"') {
+      ++(*idx);
+      return ret;
+    }
+    ret.str += json[(*idx)++];
+  }
+  MX_ERROR_MSG << "Error! Unable to parse string: '" << json.substr(start) << "'" << std::endl;
+  return JsonVal();
+}
+
+mxnet::ext::JsonVal mxnet::ext::JsonVal::parse_num(const std::string& json, unsigned int* idx) {
+  // Reads the run of decimal digits starting at *idx and converts it with std::stoi.
+  // On return *idx sits on the first non-digit character (or at the end of the text).
+  JsonVal number(NUM);
+  for (; *idx < json.size(); ++(*idx)) {
+    const char ch = json[*idx];
+    if (ch < '0' || ch > '9')
+      break;
+    number.str += ch;
+  }
+  number.num = std::stoi(number.str);
+  return number;
+}
+
+mxnet::ext::JsonVal mxnet::ext::JsonVal::parse_list(const std::string& json, unsigned int* idx) {
+  JsonVal ret(LIST);  // *idx starts just past '['; items are collected until the matching ']'
+  while (*idx < json.size()) {
+    if (json[*idx] == ']') {
+      ++(*idx);
+      return ret;
+    }
+    // parse() stops on a '}' without consuming it, so a stray one here would loop forever
+    if (json[*idx] == '}') break;
+    JsonVal item = JsonVal::parse(json, idx);
+    if (item.type != ERR) ret.list.push_back(item);  // separators come back as ERR; skip them
+  }
+  MX_ERROR_MSG << "Error! Unable to parse list: '" << json.substr(*idx) << "'" << std::endl;
+  return JsonVal();
+}
+
+mxnet::ext::JsonVal mxnet::ext::JsonVal::parse_map(const std::string& json, unsigned int* idx) {
+  JsonVal ret(MAP), key;  // key stays ERR until a key has been read and awaits its value
+  while (*idx < json.size()) {
+    if (json[*idx] == '}') {
+      ++(*idx);
+      return ret;
+    }
+    if (json[*idx] == ']') break;  // parse() never consumes a stray ']': avoid looping forever
+    JsonVal item = JsonVal::parse(json, idx);
+    if (key.type == ERR) {
+      key = item;  // first value of a pair is the key
+    } else {
+      ret.map[key] = item;  // second value completes the pair
+      key.type = ERR;
+    }
+  }
+  MX_ERROR_MSG << "Error! Unable to parse map: '" << json.substr(*idx) << "'" << std::endl;
+  return mxnet::ext::JsonVal();
+}
+
+mxnet::ext::JsonVal mxnet::ext::JsonVal::parse(const std::string& json, unsigned int *idx) {
+  JsonVal ret;  // stays ERR until a value has been parsed
+  while (*idx < json.size()) {
+    if (json[*idx] == '"') {
+      ++(*idx);
+      ret = JsonVal::parse_string(json, idx);
+    } else if (json[*idx] >= '0' && json[*idx] <= '9') {
+      ret = JsonVal::parse_num(json, idx);
+    } else if (json[*idx] == '[') {
+      ++(*idx);
+      ret = JsonVal::parse_list(json, idx);
+    } else if (json[*idx] == '{') {
+      ++(*idx);
+      ret = JsonVal::parse_map(json, idx);
+    } else if (json[*idx] == ']' || json[*idx] == '}') {return ret;}  // closer left for caller
+    if (ret.type != ERR) return ret;
+    if (*idx < json.size()) ++(*idx);  // skip separator, but never step past the end of input
+  }
+  return ret;
+}
+
+std::string mxnet::ext::JsonVal::toString() const {
+  // Debug rendering: each value is wrapped as json(<KIND>:<payload>), recursively for
+  // lists and maps. Unlike dump(), container entries are each followed by a comma.
+  if (type == ERR)
+    return "json(Error)";
+  if (type == STR)
+    return "json(STR:" + str + ")";
+  if (type == NUM)
+    return "json(INT:" + str + ")";
+
+  std::string text;
+  if (type == LIST) {
+    text = "json(LIST:[";
+    for (const auto &element : list) {
+      text += element.toString() + ",";
+    }
+    text += "])";
+  } else if (type == MAP) {
+    text = "json(MAP:{";
+    for (const auto &entry : map) {
+      text += entry.first.toString() + " : " + entry.second.toString() + ",";
+    }
+    text += "})";
+  }
+  // any other type value yields an empty string
+  return text;
+}
+
+mxnet::ext::Node::Node() {tensor = nullptr; res = nullptr;}  // alloc_arg/alloc_aux test res
+// records the pass resource that alloc_arg/alloc_aux allocate through
+void mxnet::ext::Node::_setPassResource(mxnet::ext::PassResource* res_) {res = res_;}
+
+void mxnet::ext::Node::alloc_arg(const std::vector<int64_t>& shapes,
+                                 const mxnet::ext::MXContext &ctx, mxnet::ext::MXDType dtype) {
+  if (res == nullptr)  // without a pass resource there is nothing to allocate through
+    throw std::runtime_error("Node not initialized. Cannot use alloc_arg outside of graph passes.");
+  tensor = res->alloc_arg(name, shapes, ctx, dtype);  // allocated under this node's name
+}
+
+void mxnet::ext::Node::alloc_aux(const std::vector<int64_t>& shapes,
+                                 const mxnet::ext::MXContext &ctx, mxnet::ext::MXDType dtype) {
+  if (res == nullptr)  // without a pass resource there is nothing to allocate through
+    throw std::runtime_error("Node not initialized. Cannot use alloc_aux outside of graph passes.");
+  tensor = res->alloc_aux(name, shapes, ctx, dtype);  // allocated under this node's name
+}
+
+mxnet::ext::Graph::Graph() : res(nullptr) {}  // starts without a pass resource
+
+mxnet::ext::Graph::~Graph() {
+  // release every node held in 'nodes'; subgraphs attached to nodes are not freed here
+  for (Node* owned : nodes) delete owned;
+}
+
+mxnet::ext::Graph* mxnet::ext::Graph::fromString(const std::string& json) {
+  // parse the JSON text, then build a new heap-allocated graph from the parsed value
+  return fromJson(JsonVal::parse(json));
+}
+
+mxnet::ext::Graph* mxnet::ext::Graph::fromJson(mxnet::ext::JsonVal val) {
+  // get nodes list
+  JsonVal nodes = val.map[JsonVal("nodes")];
+  Graph *g = new Graph();  // heap-allocated; returned at the end of this function
+
+  std::map<int, Node*> nodeMap;  // index in the JSON "nodes" list -> Node created for it
+  // loop over nodes
+  for (int i = 0; i < nodes.list.size(); i++) {
+    Node* n = new Node();
+    g->nodes.push_back(n);
+    JsonVal node = nodes.list[i];
+
+    // set the op info
+    n->op = node.map[JsonVal("op")].str;
+    n->name = node.map[JsonVal("name")].str;
+
+    // if op is null it is an input to the graph
+    if (n->op.compare("null") == 0)
+      g->inputs.push_back(n);
+
+    // set attrs
+    JsonVal attributes = node.map[JsonVal("attrs")];
+    for (auto& kv : attributes.map) {
+      n->attrs[kv.first.str] = kv.second.str;  // only the string form of each value is kept
+    }
+
+    // set subgraphs, parsing each into a graph
+    if (node.map.count(JsonVal("subgraphs")) > 0) {
+      JsonVal subgraphs = node.map[JsonVal("subgraphs")];
+      for (auto &subgraph : subgraphs.list) {
+        n->subgraphs.push_back(fromJson(subgraph));
+      }
+    }
+
+    // set node inputs
+    JsonVal node_inputs = node.map[JsonVal("inputs")];
+    n->inputs.resize(node_inputs.list.size());
+    for (int j = 0; j < node_inputs.list.size(); j++) {
+      JsonVal input = node_inputs.list[j];
+      NodeEntry& entry = n->inputs[j];
+      // get pointer to other node (only nodes with a smaller index are in nodeMap so far)
+      entry.node = nodeMap[input.list[0].num];
+      // get the other node's output index
+      entry.entry = input.list[1].num;
+      // set other nodes output as connected to this node
+      entry.node->outputs.push_back({n, j});  // NOTE(review): null deref if index was unknown
+    }
+    nodeMap[i] = n;  // registered last, after this node's own inputs were resolved
+  }
+
+  // set graph level outputs
+  JsonVal& heads = val.map[JsonVal("heads")];
+  g->outputs.resize(heads.list.size());
+  for (int i = 0; i < heads.list.size(); i++) {
+    JsonVal head = heads.list[i];
+    g->outputs[i].node = nodeMap[head.list[0].num];
+    g->outputs[i].entry = head.list[1].num;
+  }
+
+  // add all attributes to the graph
+  for (auto& kv : val.map) {
+    if (kv.first.str.compare("nodes") != 0 &&
+        kv.first.str.compare("heads") != 0 &&
+        kv.first.str.compare("node_row_ptr") != 0 &&
+        kv.first.str.compare("arg_nodes") != 0) {
+      g->attrs[kv.first.str] = kv.second;  // every other top-level key is a graph attribute
+    }
+  }
+  return g;
+}
+
+/* \brief convert graph object back to JSON object */
+mxnet::ext::JsonVal mxnet::ext::Graph::toJson() {
+  // top level object is a map
+  JsonVal val(MAP);
+
+  // add attributes
+  for (auto& kv : attrs) {
+    val.map[JsonVal(kv.first)] = kv.second;
+  }
+
+  // sort graph nodes in topological order, create mapping of node to index
+  std::map<Node*, int> nodeMap;
+  std::vector<Node*> sorted = topological_sort();
+  // nodes are in reverse topological order in the vector (back is first)
+  // so loop from end to front over the vector 'sorted'
+  for (int i = sorted.size()-1; i >= 0; i--) {
+    nodeMap[sorted[i]] = sorted.size()-1-i;  // the last element of 'sorted' gets index 0
+  }
+
+  // create node_row_ptr entry
+  val.map[JsonVal("node_row_ptr")] = JsonVal(LIST);
+  JsonVal& node_row_ptr = val.map[JsonVal("node_row_ptr")];
+  for (int i = 0; i < nodes.size(); i++)
+    node_row_ptr.list.emplace_back(i);  // entries are simply 0 .. nodes.size()-1
+
+  // add all input nodes
+  val.map[JsonVal("arg_nodes")] = JsonVal(LIST);
+  JsonVal& arg_nodes = val.map[JsonVal("arg_nodes")];
+  for (auto &input : inputs)
+    arg_nodes.list.emplace_back(nodeMap[input]);
+
+  // add all output nodes
+  val.map[JsonVal("heads")] = JsonVal(LIST);
+  JsonVal& heads = val.map[JsonVal("heads")];
+  for (int i = 0; i < outputs.size(); i++) {
+    heads.list.emplace_back(LIST);
+    JsonVal& out = heads.list[i];
+    out.list.emplace_back(nodeMap[outputs[i].node]);
+    out.list.emplace_back(outputs[i].entry);
+    out.list.emplace_back(0);  // third field is always written as 0
+  }
+
+  // add all graph nodes
+  val.map[JsonVal("nodes")] = JsonVal(LIST);
+  JsonVal& nodes_ = val.map[JsonVal("nodes")];
+  for (int i = sorted.size()-1; i >= 0; i--) {
+    // each node is a map
+    nodes_.list.emplace_back(MAP);
+    Node* n = sorted[i];
+    JsonVal& n_ = nodes_.list[nodes_.list.size()-1];
+
+    n_.map[JsonVal("op")] = JsonVal(n->op);
+    n_.map[JsonVal("name")] = JsonVal(n->name);
+    n_.map[JsonVal("inputs")] = JsonVal(LIST);
+
+    // add inputs for this node
+    JsonVal& inputs_ = n_.map[JsonVal("inputs")];
+    for (int j = 0; j < n->inputs.size(); j++) {
+      inputs_.list.emplace_back(LIST);
+      NodeEntry& entry = n->inputs[j];
+      JsonVal& in = inputs_.list[j];
+      in.list.emplace_back(nodeMap[entry.node]);
+      in.list.emplace_back(entry.entry);
+      in.list.emplace_back(0);  // third field is always written as 0
+    }
+
+    // add subgraphs for this node, convert each back to JSON
+    if (n->subgraphs.size() > 0) {
+      n_.map[JsonVal("subgraphs")] = JsonVal(LIST);
+      JsonVal &subgraphs_ = n_.map[JsonVal("subgraphs")];
+      for (Graph *subgraph : n->subgraphs) {
+        subgraphs_.list.push_back(subgraph->toJson());
+      }
+    }
+
+    // add attributes for this node
+    n_.map[JsonVal("attrs")] = JsonVal(MAP);
+    JsonVal& attrs_ = n_.map[JsonVal("attrs")];
+    for (auto& kv : n->attrs) {
+      attrs_.map[JsonVal(kv.first)] = JsonVal(kv.second);
+    }
+  }
+  return val;
+}
+
+/* \brief convert graph object to JSON string: the result of toJson() rendered with dump() */
+std::string mxnet::ext::Graph::toString() {
+  return toJson().dump();
+}
+
+/* \brief visits node "n": still-unvisited consumers are handled first, then n itself */
+void mxnet::ext::Graph::_dfs_util(Node* n, std::unordered_set<mxnet::ext::Node*>* to_visit,
+                                  std::function<void(mxnet::ext::Node*)> handler) const {
+  to_visit->erase(n);  // n is no longer pending once its visit starts
+  for (NodeEntry& out : n->outputs) {
+    if (to_visit->find(out.node) == to_visit->end())
+      continue;  // this consumer has already been visited
+    _dfs_util(out.node, to_visit, handler);
+  }
+  // post-order: n is reported only after everything reached through its outputs
+  handler(n);
+}
+
+/* \brief post-order DFS graph traversal */
+void mxnet::ext::Graph::DFS(std::function<void(Node*)> handler) const {
+  std::unordered_set<Node*> pending;  // nodes that have not been visited yet
+  for (Node* node : nodes)
+    pending.insert(node);
+  // traverse from the graph inputs first ...
+  for (Node* input : inputs) {
+    if (pending.find(input) != pending.end())
+      _dfs_util(input, &pending, handler);
+  }
+  // ... then from whatever the inputs did not reach
+  while (!pending.empty())
+    _dfs_util(*(pending.begin()), &pending, handler);
+}
+
+/* \brief sort graph nodes in topological order */
+std::vector<mxnet::ext::Node*> mxnet::ext::Graph::topological_sort() const {
+  // DFS reports nodes in post-order, so a node's consumers land in 'order' before the node
+  std::vector<mxnet::ext::Node*> order;
+  DFS([&order](mxnet::ext::Node* visited) {
+    order.push_back(visited);
+  });
+  return order;
+}
+
+/* \brief print out graph details to std::cout, prefixing each line with 'indent' spaces */
+void mxnet::ext::Graph::print(int indent) const {
+  std::string space = "";
+  for (int i = 0; i < indent; i++) space+=" ";
+
+  std::cout << space << "########### Graph #############" << std::endl;
+  std::cout << space << "attributes: " << std::endl;
+  for (auto &kv : attrs)
+    std::cout << space << "\t" << kv.first << " : " << kv.second.str << std::endl;
+  std::cout << space << "inputs: " << inputs.size() << std::endl;
+  std::cout << space << "outputs: " << outputs.size() << std::endl;
+  std::cout << space << "nodes: " << nodes.size() << std::endl;
+  std::vector<mxnet::ext::Node*> sorted = topological_sort();
+  // loop over each node and print out its inputs/outputs
+  for (int i = sorted.size()-1; i >= 0; i--) {  // back to front, same order toJson() uses
+    std::cout << space << "Node: " << sorted[i]->name << std::endl;
+    for (auto &input : sorted[i]->inputs) {
+      std::cout << space << "\tInput: " << input.node->name << " "
+                << input.entry << std::endl;
+    }
+    for (auto &output : sorted[i]->outputs) {
+      std::cout << space << "\tOutput: " << output.node->name << " "
+                << output.entry << std::endl;
+    }
+    if (sorted[i]->subgraphs.size() > 0) {
+      for (auto &subgraph : sorted[i]->subgraphs) {
+        std::cout << space << "\tSubgraph:" << std::endl;
+        subgraph->print(indent+2);  // nested graphs are indented two more spaces
+      }
+    }
+  }
+  std::cout << space << "###############################" << std::endl;
+}
+
+/* \brief add a new node to this graph; it is stored in 'nodes' and freed by ~Graph */
+mxnet::ext::Node* mxnet::ext::Graph::addNode(const std::string& name, const std::string& op) {
+  Node* n = new Node();
+  nodes.push_back(n);  // register the node so traversal, size() and ~Graph see it
+  n->name = name;
+  n->op = op;
+  if (res) n->_setPassResource(res);  // let the node allocate tensors during graph passes
+  return n;
+}
+
+/* \brief get node at index in graph; throws std::out_of_range for an invalid index */
+mxnet::ext::Node* mxnet::ext::Graph::getNode(size_t idx) {
+  return nodes.at(idx);
+}
+
+/* \brief get const node at index in const graph (bounds-checked through vector::at) */
+const mxnet::ext::Node* mxnet::ext::Graph::getNode(size_t idx) const {
+  return nodes.at(idx);
+}
+
+/* \brief get attribute on graph; looked up with attrs.at(key), so the key must exist */
+const mxnet::ext::JsonVal& mxnet::ext::Graph::getAttr(const std::string& key) const {
+  return attrs.at(key);
+}
+
+/* \brief get number of nodes in the graph (the length of 'nodes') */
+size_t mxnet::ext::Graph::size() const {
+  return nodes.size();
+}
+
+// internally set passResource to enable tensor allocation for graph passes
+void mxnet::ext::Graph::_setPassResource(PassResource* res_) {
+  res = res_;
+  // hand the same resource to every node currently in the graph
+  for (size_t k = 0; k < nodes.size(); ++k) {
+    nodes[k]->_setPassResource(res);
+  }
+}
+
+// internally set arg/aux params when available
+void mxnet::ext::Graph::_setParams(std::unordered_map<std::string, mxnet::ext::MXTensor>* args,
+                                   std::unordered_map<std::string, mxnet::ext::MXTensor>* aux) {
+  // point each graph input at its tensor: args are checked first, then aux, else left as is
+  for (Node* input : inputs) {
+    auto in_args = args->find(input->name);
+    if (in_args != args->end()) { input->tensor = &in_args->second; continue; }
+    auto in_aux = aux->find(input->name);
+    if (in_aux != aux->end()) input->tensor = &in_aux->second;
+  }
+}
+
+mxnet::ext::CustomOp::CustomOp(const char* op_name)  // every callback starts as nullptr
+  : name(op_name), parse_attrs(nullptr), infer_type(nullptr), infer_storage_type(nullptr),
+    infer_shape(nullptr), mutate_inputs(nullptr), isSGop(false) {}  // isSGop defaults to false
+
+mxnet::ext::CustomOp& mxnet::ext::CustomOp::setForward(mxnet::ext::fcomp_t fcomp, const char* ctx) {
+  // a context key may carry only one forward function; a second registration throws
+  if (forward_ctx_map.find(ctx) != forward_ctx_map.end()) raiseDuplicateContextError();
+  forward_ctx_map[ctx] = fcomp;
+  return *this;
+}
+
+mxnet::ext::CustomOp& mxnet::ext::CustomOp::setBackward(mxnet::ext::fcomp_t fgrad,
+                                                        const char* ctx) {
+  // a context key may carry only one backward function; a second registration throws
+  if (backward_ctx_map.find(ctx) != backward_ctx_map.end()) raiseDuplicateContextError();
+  backward_ctx_map[ctx] = fgrad;
+  return *this;
+}
+
+mxnet::ext::CustomOp& mxnet::ext::CustomOp::setParseAttrs(mxnet::ext::parseAttrs_t func) {
+  parse_attrs = func;  // reported by _opRegGet through its 'parse' output
+  return *this;  // returning the op allows chained set* calls
+}
+
+mxnet::ext::CustomOp& mxnet::ext::CustomOp::setInferType(mxnet::ext::inferType_t func) {
+  infer_type = func;  // reported by _opRegGet through its 'type' output
+  return *this;  // returning the op allows chained set* calls
+}
+
+mxnet::ext::CustomOp& mxnet::ext::CustomOp::setInferSType(mxnet::ext::inferSType_t func) {
+  infer_storage_type = func;  // reported by _opRegGet through its 'stype' output
+  return *this;  // returning the op allows chained set* calls
+}
+
+mxnet::ext::CustomOp& mxnet::ext::CustomOp::setInferShape(mxnet::ext::inferShape_t func) {
+  infer_shape = func;  // reported by _opRegGet through its 'shape' output
+  return *this;  // returning the op allows chained set* calls
+}
+
+mxnet::ext::CustomOp& mxnet::ext::CustomOp::setMutateInputs(mxnet::ext::mutateInputs_t func) {
+  mutate_inputs = func;  // reported by _opRegGet through its 'mutate' output
+  return *this;  // returning the op allows chained set* calls
+}
+
+mxnet::ext::CustomOp& mxnet::ext::CustomOp::setCreateOpState(mxnet::ext::createOpState_t func,
+                                                             const char* ctx) {
+  // a context key may carry only one create-state function; a second registration throws
+  if (create_op_ctx_map.find(ctx) != create_op_ctx_map.end()) raiseDuplicateContextError();
+  create_op_ctx_map[ctx] = func;
+  return *this;
+}
+
+mxnet::ext::CustomOp& mxnet::ext::CustomOp::setIsSubgraphOp() {
+  isSGop = true;  // reported by _opRegGet through its 'isSGop' output
+  return *this;  // returning the op allows chained set* calls
+}
+
+void mxnet::ext::CustomOp::mapToVector() {
+  forward_ctx_cstr.clear(); forward_fp.clear();  // start empty: repeated calls must not append
+  backward_ctx_cstr.clear(); backward_fp.clear();
+  create_op_ctx_cstr.clear(); create_op_fp.clear();
+  for (const auto &kv : forward_ctx_map) {  // parallel arrays: context key / function
+    forward_ctx_cstr.push_back(kv.first); forward_fp.push_back(kv.second);
+  }
+  for (const auto &kv : backward_ctx_map) {
+    backward_ctx_cstr.push_back(kv.first); backward_fp.push_back(kv.second);
+  }
+  for (const auto &kv : create_op_ctx_map) {
+    create_op_ctx_cstr.push_back(kv.first); create_op_fp.push_back(kv.second);
+  }
+}
+
+void mxnet::ext::CustomOp::raiseDuplicateContextError() {
+  // always throws: the message names this operator as having a duplicate context registration
+  const std::string prefix =
+    "Error! Error! Cannot register multiple functions under same context for operator '";
+  throw std::runtime_error(prefix + std::string(name) + "'");
+}
+
+mxnet::ext::CustomPass::CustomPass() : name("ERROR") {}  // default name is the literal "ERROR"
+mxnet::ext::CustomPass::CustomPass(const char* pass_name)
+  : name(pass_name) {}  // keeps the given pointer as the pass name
+mxnet::ext::CustomPass& mxnet::ext::CustomPass::setBody(graphPass_t fn) {
+  pass = fn;  // the function stored as this pass's body
+  return *this;  // returning the pass allows chained calls
+}
+
+mxnet::ext::CustomPartitioner::CustomPartitioner() : name("ERROR") {}  // literal "ERROR" name
+mxnet::ext::CustomPartitioner::CustomPartitioner(const char* backend_name) :
+  name(backend_name) {}  // keeps the given pointer as the partitioner name
+
+mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::addStrategy(const char* prop_name,
+                                                                          const char* sg_name) {
+  strategies.push_back(prop_name);  // strategies[i] and op_names[i] describe the same strategy
+  op_names.push_back(sg_name);
+  return *this;  // returning the partitioner allows chained calls
+}
+
+mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::setSupportedOps(const char* prop_name,
+                                                                    mxnet::ext::supportedOps_t fn) {
+  supported_map[std::string(prop_name)] = fn;  // keyed by strategy name; replaces any earlier fn
+  return *this;
+}
+
+mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::setCreateSelector(
+                                           const char* prop_name, mxnet::ext::createSelector_t fn) {
+  selector_map[std::string(prop_name)] = fn;  // keyed by strategy name; replaces any earlier fn
+  return *this;
+}
+
+mxnet::ext::CustomPartitioner& mxnet::ext::CustomPartitioner::setReviewSubgraph(
+                                           const char* prop_name, mxnet::ext::reviewSubgraph_t fn) {
+  review_map[std::string(prop_name)] = fn;  // keyed by strategy name; replaces any earlier fn
+  return *this;
+}
+
+mxnet::ext::supportedOps_t mxnet::ext::CustomPartitioner::getSupportedOps(int stg_id) {
+  // callback registered under the name of strategy 'stg_id'; nullptr when none was set
+  const std::string strategy_name(strategies[stg_id]);
+  auto found = supported_map.find(strategy_name);
+  if (found == supported_map.end()) return nullptr;
+  return found->second;
+}
+
+mxnet::ext::createSelector_t mxnet::ext::CustomPartitioner::getCreateSelector(int stg_id) {
+  // callback registered under the name of strategy 'stg_id'; nullptr when none was set
+  const std::string strategy_name(strategies[stg_id]);
+  auto found = selector_map.find(strategy_name);
+  if (found == selector_map.end()) return nullptr;
+  return found->second;
+}
+
+mxnet::ext::reviewSubgraph_t mxnet::ext::CustomPartitioner::getReviewSubgraph(int stg_id) {
+  // callback registered under the name of strategy 'stg_id'; nullptr when none was set
+  const std::string strategy_name(strategies[stg_id]);
+  auto found = review_map.find(strategy_name);
+  if (found == review_map.end()) return nullptr;
+  return found->second;
+}
+
+/*! \brief returns MXNet library version */
+MX_INT_RET _opVersion() {
+  return MX_LIBRARY_VERSION;  // the macro value this library was compiled with
+}
+
+/*! \brief returns number of ops registered in this library */
+MX_INT_RET _opRegSize() {
+  return mxnet::ext::Registry<mxnet::ext::CustomOp>::get()->size();  // size of the op registry
+}
+
+/*! \brief returns operator registration at specified index */
+MX_VOID_RET _opRegGet(int idx, const char** name, int *isSGop,
+                      const char*** forward_ctx, mxnet::ext::fcomp_t** forward_fp,
+                      int* forward_count, const char*** backward_ctx,
+                      mxnet::ext::fcomp_t** backward_fp, int* backward_count,
+                      const char*** create_op_ctx, mxnet::ext::createOpState_t** create_op_fp,
+                      int* create_op_count, mxnet::ext::parseAttrs_t* parse,
+                      mxnet::ext::inferType_t* type, mxnet::ext::inferSType_t* stype,
+                      mxnet::ext::inferShape_t* shape, mxnet::ext::mutateInputs_t* mutate) {
+  mxnet::ext::CustomOp &op = mxnet::ext::Registry<mxnet::ext::CustomOp>::get()->get(idx);
+  *name = op.name;
+  *parse = op.parse_attrs;
+  *type = op.infer_type;
+  *stype = op.infer_storage_type;
+  *shape = op.infer_shape;
+  *mutate = op.mutate_inputs;
+  *isSGop = op.isSGop;
+  op.mapToVector();  // fills the *_ctx_cstr / *_fp vectors whose storage is exposed below
+  *forward_ctx = op.forward_ctx_cstr.data();  // pointers into vectors held by the CustomOp
+  *forward_fp = op.forward_fp.data();
+  *forward_count = op.forward_fp.size();
+  *backward_ctx = op.backward_ctx_cstr.data();
+  *backward_fp = op.backward_fp.data();
+  *backward_count = op.backward_fp.size();
+  *create_op_ctx = op.create_op_ctx_cstr.data();
+  *create_op_fp = op.create_op_fp.data();
+  *create_op_count = op.create_op_fp.size();
+}
+
+/*! \brief calls free from the external library for library allocated arrays */
+MX_VOID_RET _opCallFree(void* ptr) {
+  free(ptr);  // counterpart of the malloc calls made by the _opCall* functions in this file
+}
+
+/*! \brief returns status of calling parse attributes function for operator from library */
+MX_INT_RET _opCallParseAttrs(mxnet::ext::parseAttrs_t parseAttrs, const char* const* keys,
+                             const char* const* vals, int num,
+                             int* num_in, int* num_out) {
+  // rebuild the attribute dictionary from the parallel key/value arrays
+  std::unordered_map<std::string, std::string> attrs;
+  for (int k = 0; k < num; ++k) {
+    attrs[keys[k]] = vals[k];
+  }
+  return parseAttrs(attrs, num_in, num_out);
+}
+
+/*! \brief returns status of calling inferShape function for operator from library */
+MX_INT_RET _opCallInferShape(mxnet::ext::inferShape_t inferShape, const char* const* keys,
+                             const char* const* vals, int num,
+                             unsigned int** inshapes, int* indims, int num_in,
+                             unsigned int*** mod_inshapes, int** mod_indims,
+                             unsigned int*** outshapes, int** outdims, int num_out) {
+  // create map of attributes from list
+  std::unordered_map<std::string, std::string> attrs;
+  for (int i = 0; i < num; i++) {
+    attrs[std::string(keys[i])] = std::string(vals[i]);
+  }
+
+  // create a vector of shapes for inputs
+  std::vector<std::vector<unsigned int> > in_shapes(num_in);
+  for (int i = 0; i < num_in; i++) {
+    for (int j = 0; j < indims[i]; j++) {
+      in_shapes[i].push_back(inshapes[i][j]);
+    }
+  }
+
+  // create a vector of shapes for outputs
+  std::vector<std::vector<unsigned int> > out_shapes(num_out);
+
+  int retval = inferShape(attrs, &in_shapes, &out_shapes);
+  if (!retval) return retval;  // zero status: none of the output arrays below is allocated
+
+  // allocate space for modified input dims, shape (malloc'ed; see _opCallFree)
+  *mod_indims = static_cast<int*>(malloc (num_in * sizeof(int)));
+  *mod_inshapes = static_cast<unsigned**>(malloc (num_in * sizeof(unsigned*)));
+
+  // copy modified input shapes
+  for (int i = 0; i < num_in; i++) {
+    (*mod_indims)[i] = in_shapes[i].size();
+    (*mod_inshapes)[i] = static_cast<unsigned*>(malloc ((*mod_indims)[i] * sizeof(unsigned)));
+    for (int j = 0; j < (*mod_indims)[i]; j++) {
+      (*mod_inshapes)[i][j] = in_shapes[i][j];
+    }
+  }
+
+  // allocate space for output dims, shape (malloc'ed; see _opCallFree)
+  *outdims = static_cast<int*>(malloc (num_out * sizeof(int)));
+  *outshapes = static_cast<unsigned**>(malloc (num_out * sizeof(unsigned*)));
+
+  // copy output shapes
+  for (int i = 0; i < num_out; i++) {
+    (*outdims)[i] = out_shapes[i].size();
+    (*outshapes)[i] = static_cast<unsigned*>(malloc ((*outdims)[i] * sizeof(unsigned)));
+    for (int j = 0; j < (*outdims)[i]; j++) {
+      (*outshapes)[i][j] = out_shapes[i][j];
+    }
+  }
+  return retval;
+}
+
+/*! \brief returns status of calling inferType function for operator from library */
+MX_INT_RET _opCallInferType(mxnet::ext::inferType_t inferType, const char* const* keys,
+                            const char* const* vals, int num,
+                            int* intypes, int num_in, int* outtypes, int num_out) {
+  // rebuild the attribute dictionary from the parallel key/value arrays
+  std::unordered_map<std::string, std::string> attrs;
+  for (int k = 0; k < num; ++k) {
+    attrs[keys[k]] = vals[k];
+  }
+
+  // snapshot the incoming input types; the callback receives them by pointer
+  std::vector<int> in_types(num_in);
+  for (int k = 0; k < num_in; ++k) {
+    in_types[k] = intypes[k];
+  }
+
+  // every output type starts as -1 before the callback runs
+  std::vector<int> out_types(num_out, -1);
+
+  const int status = inferType(attrs, &in_types, &out_types);
+
+  if (status) {
+    // write the input types back, then the output types
+    for (int k = 0; k < num_in; ++k) {
+      intypes[k] = in_types[k];
+    }
+    for (int k = 0; k < num_out; ++k) {
+      outtypes[k] = out_types[k];
+    }
+  }
+
+  // a zero status leaves the caller's arrays untouched
+  return status;
+}
+
+/*! \brief returns status of calling inferSType function for operator from library */
+MX_INT_RET _opCallInferSType(mxnet::ext::inferSType_t inferSType, const char* const* keys,
+                             const char* const* vals, int num,
+                             int* instypes, int num_in, int* outstypes, int num_out) {
+  // rebuild the attribute dictionary from the parallel key/value arrays
+  std::unordered_map<std::string, std::string> attrs;
+  for (int k = 0; k < num; ++k) {
+    attrs[keys[k]] = vals[k];
+  }
+
+  // snapshot the incoming input storage types; the callback receives them by pointer
+  std::vector<int> in_stypes(num_in);
+  for (int k = 0; k < num_in; ++k) {
+    in_stypes[k] = instypes[k];
+  }
+
+  // every output storage type starts as -1 before the callback runs
+  std::vector<int> out_stypes(num_out, -1);
+
+  const int status = inferSType(attrs, &in_stypes, &out_stypes);
+
+  if (status) {
+    // write the input storage types back ...
+    for (int k = 0; k < num_in; ++k) {
+      instypes[k] = in_stypes[k];
+    }
+    // ... followed by the output storage types
+    for (int k = 0; k < num_out; ++k) {
+      outstypes[k] = out_stypes[k];
+    }
+  }
+
+  // a zero status leaves the caller's arrays untouched
+  return status;
+}
+
+/*! \brief returns status of calling Forward/Backward function for operator from library */
+MX_INT_RET _opCallFCompute(mxnet::ext::fcomp_t fcomp, const char* const* keys,
+                           const char* const* vals,
+                           int num, const int64_t** inshapes, int* indims, void** indata,
+                           int* intypes, size_t* inIDs, const char** indev_type, int* indev_id,
+                           int num_in, const int64_t** outshapes, int* outdims, void** outdata,
+                           int* outtypes, size_t* outIDs, const char** outdev_type,
+                           int* outdev_id, int num_out, mxnet::ext::xpu_malloc_t cpu_malloc,
+                           void* cpu_alloc,
+                           mxnet::ext::xpu_malloc_t gpu_malloc, void* gpu_alloc,
+                           void* cuda_stream,
+                           mxnet::ext::sparse_malloc_t sparse_malloc, void* sparse_alloc,
+                           int* instypes, int* outstypes, void** in_indices, void** out_indices,
+                           void** in_indptr, void** out_indptr,
+                           int64_t* in_indices_shapes, int64_t* out_indices_shapes,
+                           int64_t* in_indptr_shapes, int64_t* out_indptr_shapes,
+                           void* rng_cpu_states, void* rng_gpu_states) {
+  // create map of attributes from list
+  std::unordered_map<std::string, std::string> attrs;
+  for (int i = 0; i < num; i++) {
+    attrs[std::string(keys[i])] = std::string(vals[i]);
+  }
+
+  // create a vector of tensors for inputs
+  std::vector<mxnet::ext::MXTensor> inputs(num_in);
+  // create a vector for sparse inputs (slot i is only filled when input i is sparse)
+  std::vector<mxnet::ext::MXSparse> in_sparse(num_in);
+
+  for (int i = 0; i < num_in; i++) {
+    // Dense representation.
+    if (instypes[i] == 0) {
+      inputs[i].setTensor(indata[i], (mxnet::ext::MXDType)intypes[i], inshapes[i], indims[i],
+                          inIDs[i], mxnet::ext::MXContext(indev_type[i], indev_id[i]),
+                          mxnet::ext::kDefaultStorage);
+    } else {
+      // Sparse representation.
+      mxnet::ext::MXStorageType type;
+      if (instypes[i] == 1) {  // 1 selects row-sparse; any other non-zero value is CSR
+        type = mxnet::ext::kRowSparseStorage;
+        in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i], in_indices_shapes[i]);
+      } else {
+        type = mxnet::ext::kCSRStorage;
+        in_sparse[i].set(indata[i], inshapes[i], indims[i], in_indices[i],
+                         in_indices_shapes[i], in_indptr[i], in_indptr_shapes[i]);
+      }
+      inputs[i].setTensor(reinterpret_cast<void*>(&in_sparse[i]), (mxnet::ext::MXDType)intypes[i],
+                          inshapes[i], indims[i], inIDs[i],
+                          mxnet::ext::MXContext(indev_type[i], indev_id[i]), type);
+    }
+  }
+
+  // create a vector of tensors for outputs
+  std::vector<mxnet::ext::MXTensor> outputs(num_out);
+  std::vector<mxnet::ext::MXSparse> out_sparse(num_out);  // filled only for sparse outputs
+
+  for (int i = 0; i < num_out; i++) {
+    // Dense representation.
+    if (outstypes[i] == 0) {
+      outputs[i].setTensor(outdata[i], (mxnet::ext::MXDType)outtypes[i], outshapes[i], outdims[i],
+                           outIDs[i], mxnet::ext::MXContext(outdev_type[i], outdev_id[i]),
+                           mxnet::ext::kDefaultStorage);
+    } else {
+      // Sparse representation.
+      mxnet::ext::MXStorageType type;
+      if (outstypes[i] == 1) {  // 1 selects row-sparse; any other non-zero value is CSR
+        type = mxnet::ext::kRowSparseStorage;
+        out_sparse[i].set(outdata[i], outshapes[i], outdims[i],
+                          out_indices[i], out_indices_shapes[i]);
+      } else {
+        type = mxnet::ext::kCSRStorage;
+        out_sparse[i].set(outdata[i], outshapes[i], outdims[i], out_indices[i],
+                          out_indices_shapes[i], out_indptr[i], out_indptr_shapes[i]);
+      }
+      outputs[i].setTensor(reinterpret_cast<void*>(&out_sparse[i]),
+                           (mxnet::ext::MXDType)outtypes[i],
+                           outshapes[i], outdims[i], outIDs[i],
+                           mxnet::ext::MXContext(outdev_type[i], outdev_id[i]), type);
+    }
+  }
+
+  // bundle the allocator callbacks, stream and RNG state pointers for the operator
+  mxnet::ext::OpResource res(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc,
+                             cuda_stream, sparse_malloc, sparse_alloc,
+                             rng_cpu_states, rng_gpu_states);
+  return fcomp(attrs, &inputs, &outputs, res);
+}
+
+/*! \brief returns status of calling the library's mutateInputs function for an operator */
+MX_INT_RET _opCallMutateInputs(mxnet::ext::mutateInputs_t mutate, const char* const* keys,
+                               const char* const* vals, int num,
+                               int** mutate_indices, int* indices_size) {
+  // build the attribute map from the parallel keys/vals arrays (num entries each)
+  std::unordered_map<std::string, std::string> attrs;
+  for (int i = 0; i < num; i++) {
+    attrs[std::string(keys[i])] = std::string(vals[i]);
+  }
+
+  // vector that the mutate call below fills with the mutate input indices
+  std::vector<int> mut_ind;
+
+  int retval = mutate(attrs, &mut_ind);
+  if (!retval)  // zero status: return before any output is allocated or written
+    return retval;
+
+  // output the input indices
+  *indices_size = mut_ind.size();
+  *mutate_indices = static_cast<int*>(malloc (*indices_size * sizeof(int)));
+  for (int i = 0; i < *indices_size; i++) {
+    (*mutate_indices)[i] = mut_ind[i];
+  }
+
+  return retval;  // NOTE(review): *mutate_indices is malloc'd; presumably freed via _opCallFree
+}
+
+/*! \brief invokes the library's createOpState function and returns its status */
+MX_INT_RET _opCallCreateOpState(mxnet::ext::createOpState_t create_op, const char* const* keys,
+                                const char* const* vals, int num,
+                                void** state_op) {
+  // rebuild the attribute map from the parallel key/value arrays
+  std::unordered_map<std::string, std::string> attrs;
+  for (int idx = 0; idx < num; ++idx) {
+    attrs[keys[idx]] = vals[idx];
+  }
+
+  // the caller's opaque slot is viewed as a CustomStatefulOp* slot, which is
+  // handed to create_op so the library can store its op instance into it
+  typedef mxnet::ext::CustomStatefulOp* StatefulOpPtr;
+  StatefulOpPtr* op_slot = reinterpret_cast<StatefulOpPtr*>(state_op);
+  return create_op(attrs, op_slot);
+}
+
+/*! \brief runs Forward or Backward of a stateful operator from library and returns its status */
+MX_INT_RET _opCallFStatefulCompute(int is_forward, void* state_op, const int64_t** inshapes,
+                                   int* indims, void** indata, int* intypes, size_t* inIDs,
+                                   const char** indev_type, int* indev_id, int num_in,
+                                   const int64_t** outshapes, int* outdims, void** outdata,
+                                   int* outtypes, size_t* outIDs, const char** outdev_type,
+                                   int* outdev_id, int num_out,
+                                   mxnet::ext::xpu_malloc_t cpu_malloc,
+                                   void* cpu_alloc, mxnet::ext::xpu_malloc_t gpu_malloc,
+                                   void* gpu_alloc,
+                                   void* stream, mxnet::ext::sparse_malloc_t sparse_malloc,
+                                   void* sparse_alloc, int* instypes, int* outstypes,
+                                   void** in_indices, void** out_indices, void** in_indptr,
+                                   void** out_indptr, int64_t* in_indices_shapes,
+                                   int64_t* out_indices_shapes, int64_t* in_indptr_shapes,
+                                   int64_t* out_indptr_shapes,
+                                   void* rng_cpu_states, void* rng_gpu_states) {
+  // one MXTensor per input
+  std::vector<mxnet::ext::MXTensor> inputs(num_in);
+  // one MXSparse slot per input; only the slots of sparse inputs get filled in
+  std::vector<mxnet::ext::MXSparse> in_sparse(num_in);
+
+  for (int k = 0; k < num_in; ++k) {
+    if (instypes[k] != 0) {
+      // Sparse representation: storage type 1 is row-sparse, anything else is CSR.
+      mxnet::ext::MXSparse& sp = in_sparse[k];
+      if (instypes[k] == 1) {
+        sp.set(indata[k], inshapes[k], indims[k], in_indices[k], in_indices_shapes[k]);
+      } else {
+        sp.set(indata[k], inshapes[k], indims[k], in_indices[k],
+               in_indices_shapes[k], in_indptr[k], in_indptr_shapes[k]);
+      }
+      inputs[k].setTensor(reinterpret_cast<void*>(&sp), (mxnet::ext::MXDType)intypes[k],
+                          inshapes[k], indims[k], inIDs[k],
+                          mxnet::ext::MXContext(indev_type[k], indev_id[k]),
+                          instypes[k] == 1 ? mxnet::ext::kRowSparseStorage
+                                           : mxnet::ext::kCSRStorage);
+    } else {
+      // Dense representation.
+      inputs[k].setTensor(indata[k], (mxnet::ext::MXDType)intypes[k], inshapes[k], indims[k],
+                          inIDs[k], mxnet::ext::MXContext(indev_type[k], indev_id[k]),
+                          mxnet::ext::kDefaultStorage);
+    }
+  }
+
+  // one MXTensor per output
+  std::vector<mxnet::ext::MXTensor> outputs(num_out);
+  // one MXSparse slot per output; only the slots of sparse outputs get filled in
+  std::vector<mxnet::ext::MXSparse> out_sparse(num_out);
+
+  for (int k = 0; k < num_out; ++k) {
+    if (outstypes[k] != 0) {
+      // Sparse representation: storage type 1 is row-sparse, anything else is CSR.
+      // The tensor is pointed at the MXSparse slot instead of at the raw data.
+      mxnet::ext::MXSparse& sp = out_sparse[k];
+      if (outstypes[k] == 1) {
+        sp.set(outdata[k], outshapes[k], outdims[k], out_indices[k],
+               out_indices_shapes[k]);
+      } else {
+        sp.set(outdata[k], outshapes[k], outdims[k], out_indices[k],
+               out_indices_shapes[k], out_indptr[k], out_indptr_shapes[k]);
+      }
+      outputs[k].setTensor(reinterpret_cast<void*>(&sp), (mxnet::ext::MXDType)outtypes[k],
+                           outshapes[k], outdims[k], outIDs[k],
+                           mxnet::ext::MXContext(outdev_type[k], outdev_id[k]),
+                           outstypes[k] == 1 ? mxnet::ext::kRowSparseStorage
+                                             : mxnet::ext::kCSRStorage);
+    } else {
+      // Dense representation.
+      outputs[k].setTensor(outdata[k], (mxnet::ext::MXDType)outtypes[k], outshapes[k], outdims[k],
+                           outIDs[k], mxnet::ext::MXContext(outdev_type[k], outdev_id[k]),
+                           mxnet::ext::kDefaultStorage);
+    }
+  }
+
+  mxnet::ext::OpResource resource(cpu_malloc, cpu_alloc, gpu_malloc, gpu_alloc, stream,
+                                  sparse_malloc, sparse_alloc, rng_cpu_states, rng_gpu_states);
+
+  mxnet::ext::CustomStatefulOp* stateful_op =
+    reinterpret_cast<mxnet::ext::CustomStatefulOp*>(state_op);
+  if (!is_forward) {
+    return stateful_op->Backward(&inputs, &outputs, resource);
+  }
+  return stateful_op->Forward(&inputs, &outputs, resource);
+}
+
+/*! \brief returns number of CustomPartitioner entries registered in this library */
+MX_INT_RET _partRegSize() {
+  return mxnet::ext::Registry<mxnet::ext::CustomPartitioner>::get()->size();
+}
+
+/*! \brief reports the name of the partitioner at the given index through name
+ *  and returns the number of strategies registered for it */
+MX_INT_RET _partRegGetCount(int idx, const char** name) {
+  typedef mxnet::ext::Registry<mxnet::ext::CustomPartitioner> PartRegistry;
+  mxnet::ext::CustomPartitioner partitioner = PartRegistry::get()->get(idx);
+  *name = partitioner.name;
+  return partitioner.strategies.size();
+}
+
+/*! \brief fetches strategy name, op name and callbacks of one strategy of a partitioner */
+MX_VOID_RET _partRegGet(int part_idx, int stg_idx, const char** strategy,
+                        mxnet::ext::supportedOps_t* supportedOps,
+                        mxnet::ext::createSelector_t* createSelector,
+                        mxnet::ext::reviewSubgraph_t* reviewSubgraph, const char** op_name) {
+  typedef mxnet::ext::Registry<mxnet::ext::CustomPartitioner> PartRegistry;
+  mxnet::ext::CustomPartitioner partitioner = PartRegistry::get()->get(part_idx);
+  *strategy = partitioner.strategies[stg_idx];
+  *op_name = partitioner.op_names[stg_idx];
+  *supportedOps = partitioner.getSupportedOps(stg_idx);
+  *createSelector = partitioner.getCreateSelector(stg_idx);
+  *reviewSubgraph = partitioner.getReviewSubgraph(stg_idx);
+}
+
+/*! \brief runs the library's supportedOps function on the JSON graph and returns its status */
+MX_INT_RET _partCallSupportedOps(mxnet::ext::supportedOps_t supportedOps, const char *json,
+                                 int num_ids, int *ids, const char* const* opt_keys,
+                                 const char* const* opt_vals, int num_opts) {
+  mxnet::ext::Graph *parsed_graph = mxnet::ext::Graph::fromString(json);
+  // gather the option key/value pairs into a map
+  std::unordered_map<std::string, std::string> options;
+  for (int k = 0; k < num_opts; ++k) {
+    options[opt_keys[k]] = opt_vals[k];
+  }
+
+  // num_ids subgraph-ID entries, each initialised to -2 before the library sees them
+  std::vector<int> node_ids(num_ids, -2);
+  const mxnet::ext::MXReturnValue status = supportedOps(parsed_graph, &node_ids, options);
+  if (!status) return status;
+
+  // pass the IDs written by the library back through the caller's array
+  for (int k = 0; k < num_ids; ++k) {
+    ids[k] = node_ids[k];
+  }
+  return status;
+}
+
+/*! \brief invokes the library's createSelector on the JSON graph and returns its status */
+MX_INT_RET _partCallCreateSelector(mxnet::ext::createSelector_t createSelector, const char *json,
+                                   void** selector, const char* const* opt_keys,
+                                   const char* const* opt_vals, int num_opts) {
+  mxnet::ext::Graph *parsed_graph = mxnet::ext::Graph::fromString(json);
+  // gather the option key/value pairs into a map
+  std::unordered_map<std::string, std::string> options;
+  for (int k = 0; k < num_opts; ++k) {
+    options[opt_keys[k]] = opt_vals[k];
+  }
+
+  // view the caller's opaque slot as a CustomOpSelector* slot; the library's
+  // createSelector is handed that slot to fill in with its selector instance
+  typedef mxnet::ext::CustomOpSelector* SelectorPtr;
+  SelectorPtr* selector_slot = reinterpret_cast<SelectorPtr*>(selector);
+
+  return createSelector(parsed_graph, selector_slot, options);
+}
+
+/*! \brief calls Select on the library's selector and stores its decision in selected */
+MX_VOID_RET _partCallSelect(void* sel_inst, int nodeID, int* selected) {
+  typedef mxnet::ext::CustomOpSelector Selector;
+  Selector* const selector = reinterpret_cast<Selector*>(sel_inst);
+  *selected = selector->Select(nodeID);
+}
+
+/*! \brief calls SelectInput on the library's selector and stores its decision in selected */
+MX_VOID_RET _partCallSelectInput(void* sel_inst, int nodeID,
+                                 int input_nodeID, int* selected) {
+  typedef mxnet::ext::CustomOpSelector Selector;
+  Selector* const selector = reinterpret_cast<Selector*>(sel_inst);
+  *selected = selector->SelectInput(nodeID, input_nodeID);
+}
+
+/*! \brief calls SelectOutput on the library's selector and stores its decision in selected */
+MX_VOID_RET _partCallSelectOutput(void* sel_inst, int nodeID,
+                                  int output_nodeID, int* selected) {
+  typedef mxnet::ext::CustomOpSelector Selector;
+  Selector* const selector = reinterpret_cast<Selector*>(sel_inst);
+  *selected = selector->SelectOutput(nodeID, output_nodeID);
+}
+
+/*! \brief calls Filter on the library's selector; kept IDs are returned in a malloc'd array */
+MX_VOID_RET _partCallFilter(void* sel_inst, int* candidates, int num_candidates,
+                            int** keep, int* num_keep) {
+  mxnet::ext::CustomOpSelector* sel_ptr =
+    reinterpret_cast<mxnet::ext::CustomOpSelector*>(sel_inst);
+  std::vector<int> candidates_(num_candidates);  // copy of the caller's candidates array
+  for (int i=0; i < num_candidates; i++) {
+    candidates_[i] = candidates[i];
+  }
+  std::vector<int> keep_;  // passed to Filter below, then copied out to *keep
+
+  sel_ptr->Filter(candidates_, &keep_);
+
+  *num_keep = keep_.size();
+  *keep = static_cast<int*>(malloc(keep_.size() * sizeof(int)));
+  for (unsigned i=0; i < keep_.size(); i++)
+    (*keep)[i] = keep_[i];
+}
+
+/*! \brief calls Reset on the library's selector instance */
+MX_VOID_RET _partCallReset(void* sel_inst) {
+  typedef mxnet::ext::CustomOpSelector Selector;
+  Selector* const selector = reinterpret_cast<Selector*>(sel_inst);
+  selector->Reset();
+}
+
+/*! \brief calls reviewSubgraph from library; outputs the accept flag and subgraph attributes */
+MX_INT_RET _partCallReviewSubgraph(mxnet::ext::reviewSubgraph_t reviewSubgraph, const char *json,
+                                   int subgraph_id, int *accept, const char* const* opt_keys,
+                                   const char* const* opt_vals, int num_opts,
+                                   char*** attr_keys, char*** attr_vals, int *num_attrs,
+                                   const char* const* arg_names, int num_args,
+                                   void* const* arg_data, const int64_t* const* arg_shapes,
+                                   const int* arg_dims, const int* arg_types,
+                                   const size_t* arg_IDs, const char* const* arg_dev_type,
+                                   const int* arg_dev_id,
+                                   const char* const* aux_names, int num_aux,
+                                   void* const* aux_data, const int64_t* const* aux_shapes,
+                                   const int* aux_dims, const int* aux_types,
+                                   const size_t* aux_IDs, const char* const* aux_dev_type,
+                                   const int* aux_dev_id) {
+  mxnet::ext::Graph *subgraph = mxnet::ext::Graph::fromString(json);
+  bool accept_bool = false;  // written by reviewSubgraph, copied to *accept further down
+  // create map of options from list
+  std::unordered_map<std::string, std::string> opts;
+  for (int i = 0; i < num_opts; i++)
+    opts[std::string(opt_keys[i])] = std::string(opt_vals[i]);
+
+  // create a map of named tensors for args, keyed by arg_names[i]
+  std::unordered_map<std::string, mxnet::ext::MXTensor> args;
+  for (int i = 0; i < num_args; i++) {
+    std::vector<int64_t> shapes;
+    for (int j = 0; j < arg_dims[i]; j++)
+      shapes.push_back(arg_shapes[i][j]);
+
+    mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i],
+                                arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i], arg_dev_id[i]));
+    args[arg_names[i]] = tensor;
+  }
+  // create a map of named tensors for aux, keyed by aux_names[i]
+  std::unordered_map<std::string, mxnet::ext::MXTensor> aux;
+  for (int i = 0; i < num_aux; i++) {
+    std::vector<int64_t> shapes;
+    for (int j = 0; j < aux_dims[i]; j++)
+      shapes.push_back(aux_shapes[i][j]);
+
+    mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i],
+                                aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i],
+                                                                  aux_dev_id[i]));
+    aux[aux_names[i]] = tensor;
+  }
+
+  subgraph->_setParams(&args, &aux);
+  mxnet::ext::MXReturnValue retval = reviewSubgraph(subgraph, subgraph_id, &accept_bool,
+                                                    opts);
+  if (!retval) return retval;  // zero status: none of the outputs below are written
+
+  *accept = accept_bool;
+
+  if (subgraph->attrs.size() > 0) {
+    *num_attrs = subgraph->attrs.size();
+    // allocate space for attributes
+    *attr_keys = static_cast<char**>(malloc (*num_attrs * sizeof(char*)));
+    *attr_vals = static_cast<char**>(malloc (*num_attrs * sizeof(char*)));
+
+    // copy attributes
+    int i = 0;
+    for (auto kv : subgraph->attrs) {
+      (*attr_keys)[i] = static_cast<char*>(malloc ((kv.first.size()+1) * sizeof(char)));
+      std::string val = kv.second.dump();  // convert JsonVal back to string
+      (*attr_vals)[i] = static_cast<char*>(malloc ((val.size()+1) * sizeof(char)));
+      snprintf((*attr_keys)[i], kv.first.size()+1, "%s", kv.first.c_str());
+      snprintf((*attr_vals)[i], val.size()+1, "%s", val.c_str());
+      i++;
+    }
+  }
+
+  return retval;
+}
+
+/*! \brief returns number of CustomPass entries (graph passes) registered in this library */
+MX_INT_RET _passRegSize() {
+  return mxnet::ext::Registry<mxnet::ext::CustomPass>::get()->size();
+}
+
+/*! \brief fetches the pass function and the name of the graph pass registered at pass_idx */
+MX_VOID_RET _passRegGet(int pass_idx, mxnet::ext::graphPass_t* graphPass,
+                        const char** pass_name) {
+  typedef mxnet::ext::Registry<mxnet::ext::CustomPass> PassRegistry;
+  mxnet::ext::CustomPass entry = PassRegistry::get()->get(pass_idx);
+  *graphPass = entry.pass;
+  *pass_name = entry.name;
+}
+
+/*! \brief calls a graph pass from library; returns its status and the resulting graph JSON */
+MX_INT_RET _passCallGraphPass(mxnet::ext::graphPass_t graphPass, const char *json,
+                              char** out_graph, const char* const* opt_keys,
+                              const char* const* opt_vals, int num_opts,
+                              const char* pass_name, const char* const* arg_names, int num_args,
+                              void* const* arg_data, const int64_t* const* arg_shapes,
+                              const int* arg_dims, const int* arg_types,
+                              const size_t* arg_IDs, const char* const* arg_dev_type,
+                              const int* arg_dev_id, const char* const* aux_names, int num_aux,
+                              void* const* aux_data, const int64_t* const* aux_shapes,
+                              const int* aux_dims, const int* aux_types,
+                              const size_t* aux_IDs, const char* const* aux_dev_type,
+                              const int* aux_dev_id, mxnet::ext::nd_malloc_t nd_malloc,
+                              const void* nd_alloc) {
+  mxnet::ext::Graph *graph = mxnet::ext::Graph::fromString(json);
+  // create map of options from list
+  std::unordered_map<std::string, std::string> opts;
+  for (int i = 0; i < num_opts; i++)
+    opts[std::string(opt_keys[i])] = std::string(opt_vals[i]);
+
+  // create a map of named tensors for args, keyed by arg_names[i]
+  std::unordered_map<std::string, mxnet::ext::MXTensor> args;
+  for (int i = 0; i < num_args; i++) {
+    std::vector<int64_t> shapes;
+    for (int j = 0; j < arg_dims[i]; j++)
+      shapes.push_back(arg_shapes[i][j]);
+
+    mxnet::ext::MXTensor tensor(arg_data[i], shapes, (mxnet::ext::MXDType)arg_types[i],
+                                arg_IDs[i], mxnet::ext::MXContext(arg_dev_type[i], arg_dev_id[i]));
+    args[arg_names[i]] = tensor;
+  }
+  // create a map of named tensors for aux, keyed by aux_names[i]
+  std::unordered_map<std::string, mxnet::ext::MXTensor> aux;
+  for (int i = 0; i < num_aux; i++) {
+    std::vector<int64_t> shapes;
+    for (int j = 0; j < aux_dims[i]; j++)
+      shapes.push_back(aux_shapes[i][j]);
+
+    mxnet::ext::MXTensor tensor(aux_data[i], shapes, (mxnet::ext::MXDType)aux_types[i],
+                                aux_IDs[i], mxnet::ext::MXContext(aux_dev_type[i], aux_dev_id[i]));
+    aux[aux_names[i]] = tensor;
+  }
+
+  std::unordered_map<std::string, mxnet::ext::MXTensor> new_args, new_aux;
+  mxnet::ext::PassResource res(&new_args, &new_aux, nd_malloc, nd_alloc);
+  graph->_setParams(&args, &aux);
+  graph->_setPassResource(&res);
+  mxnet::ext::MXReturnValue retval = graphPass(graph, opts);
+  if (!retval) return retval;  // zero status: *out_graph is left untouched
+
+  // return the JSON in a malloc'd buffer (not a leaked std::string) so free() can release it
+  const std::string out_json = graph->toString();
+  *out_graph = static_cast<char*>(malloc(out_json.size() + 1));  // NOLINT
+  snprintf(*out_graph, out_json.size() + 1, "%s", out_json.c_str());
+  return retval;
+}
+
+/*!
+ * \brief Checks if the MXNet version is supported by the library and, if so, initializes it.
+ * \param version MXNet version number passed to library and defined as:
+ *                MXNET_VERSION = (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH)
+ * \return MXReturnValue status, e.g. failure when the library is incompatible with version.
+ *         NOTE(review): this said "non-zero on error" but !retval means failure here; confirm
+ */
+#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
+__declspec(dllexport) mxnet::ext::MXReturnValue __cdecl
+#else
+mxnet::ext::MXReturnValue
+#endif
+initialize(int version);
+
+MX_INT_RET _msgSize() {  // returns number of error messages held by MXerrorMsgs
+  return mxnet::ext::MXerrorMsgs::get()->size();
+}
+
+/*! \brief returns the error message stored at the specified index as a C string */
+MX_VOID_RET _msgGet(int idx, const char** msg) {
+  *msg = mxnet::ext::MXerrorMsgs::get()->get(idx)->c_str();
+}

From d63ac35b07b11f41379f48d556549b816b7de3f6 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-6-220.us-west-2.compute.internal>
Date: Wed, 2 Sep 2020 01:11:38 +0000
Subject: [PATCH 2/8] fixed c++17 downgrade

---
 src/lib_api.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/lib_api.cc b/src/lib_api.cc
index f46c36a32d64..220558a69379 100644
--- a/src/lib_api.cc
+++ b/src/lib_api.cc
@@ -37,7 +37,7 @@ mxnet::ext::MXerrorMsgs* mxnet::ext::MXerrorMsgs::get() {
   }
 
 std::stringstream& mxnet::ext::MXerrorMsgs::add(const char* file, int line) {
-  messages.emplace_back();
+  messages.push_back(std::stringstream());
   messages.back() << file << "[" << line << "]: ";
   return messages.back();
 }
@@ -491,7 +491,7 @@ mxnet::ext::Graph* mxnet::ext::Graph::fromJson(mxnet::ext::JsonVal val) {
 
   std::map<int, Node*> nodeMap;
   // loop over nodes
-  for (int i = 0; i < nodes.list.size(); i++) {
+  for (size_t i = 0; i < nodes.list.size(); i++) {
     Node* n = new Node();
     g->nodes.push_back(n);
     JsonVal node = nodes.list[i];
@@ -521,7 +521,7 @@ mxnet::ext::Graph* mxnet::ext::Graph::fromJson(mxnet::ext::JsonVal val) {
     // set node inputs
     JsonVal node_inputs = node.map[JsonVal("inputs")];
     n->inputs.resize(node_inputs.list.size());
-    for (int j = 0; j < node_inputs.list.size(); j++) {
+    for (size_t j = 0; j < node_inputs.list.size(); j++) {
       JsonVal input = node_inputs.list[j];
       NodeEntry& entry = n->inputs[j];
       // get pointer to other node
@@ -537,7 +537,7 @@ mxnet::ext::Graph* mxnet::ext::Graph::fromJson(mxnet::ext::JsonVal val) {
   // set graph level outputs
   JsonVal& heads = val.map[JsonVal("heads")];
   g->outputs.resize(heads.list.size());
-  for (int i = 0; i < heads.list.size(); i++) {
+  for (size_t i = 0; i < heads.list.size(); i++) {
     JsonVal head = heads.list[i];
     g->outputs[i].node = nodeMap[head.list[0].num];
     g->outputs[i].entry = head.list[1].num;
@@ -577,7 +577,7 @@ mxnet::ext::JsonVal mxnet::ext::Graph::toJson() {
   // create node_row_ptr entry
   val.map[JsonVal("node_row_ptr")] = JsonVal(LIST);
   JsonVal& node_row_ptr = val.map[JsonVal("node_row_ptr")];
-  for (int i = 0; i < nodes.size(); i++)
+  for (size_t i = 0; i < nodes.size(); i++)
     node_row_ptr.list.emplace_back(i);
 
   // add all input nodes
@@ -589,7 +589,7 @@ mxnet::ext::JsonVal mxnet::ext::Graph::toJson() {
   // add all output nodes
   val.map[JsonVal("heads")] = JsonVal(LIST);
   JsonVal& heads = val.map[JsonVal("heads")];
-  for (int i = 0; i < outputs.size(); i++) {
+  for (size_t i = 0; i < outputs.size(); i++) {
     heads.list.emplace_back(LIST);
     JsonVal& out = heads.list[i];
     out.list.emplace_back(nodeMap[outputs[i].node]);
@@ -612,7 +612,7 @@ mxnet::ext::JsonVal mxnet::ext::Graph::toJson() {
 
     // add inputs for this node
     JsonVal& inputs_ = n_.map[JsonVal("inputs")];
-    for (int j = 0; j < n->inputs.size(); j++) {
+    for (size_t j = 0; j < n->inputs.size(); j++) {
       inputs_.list.emplace_back(LIST);
       NodeEntry& entry = n->inputs[j];
       JsonVal& in = inputs_.list[j];

From 12651a93b36c1ef2173e3f1d2986fdd74c0f0f01 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-6-220.us-west-2.compute.internal>
Date: Wed, 2 Sep 2020 01:54:40 +0000
Subject: [PATCH 3/8] fixed stringstream

---
 src/lib_api.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lib_api.cc b/src/lib_api.cc
index 220558a69379..ad8fcc4ddb37 100644
--- a/src/lib_api.cc
+++ b/src/lib_api.cc
@@ -37,7 +37,7 @@ mxnet::ext::MXerrorMsgs* mxnet::ext::MXerrorMsgs::get() {
   }
 
 std::stringstream& mxnet::ext::MXerrorMsgs::add(const char* file, int line) {
-  messages.push_back(std::stringstream());
+  messages.resize(messages.size()+1);
   messages.back() << file << "[" << line << "]: ";
   return messages.back();
 }
@@ -529,7 +529,7 @@ mxnet::ext::Graph* mxnet::ext::Graph::fromJson(mxnet::ext::JsonVal val) {
       // get the other node's output index
       entry.entry = input.list[1].num;
       // set other nodes output as connected to this node
-      entry.node->outputs.push_back({n, j});
+      entry.node->outputs.push_back({n, (int)j});
     }
     nodeMap[i] = n;
   }

From 5ca875d8293926359fd88595bc78720d4cd13ef6 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-6-220.us-west-2.compute.internal>
Date: Wed, 2 Sep 2020 03:00:10 +0000
Subject: [PATCH 4/8] fixed cast

---
 src/lib_api.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib_api.cc b/src/lib_api.cc
index ad8fcc4ddb37..8be4423d4000 100644
--- a/src/lib_api.cc
+++ b/src/lib_api.cc
@@ -529,7 +529,7 @@ mxnet::ext::Graph* mxnet::ext::Graph::fromJson(mxnet::ext::JsonVal val) {
       // get the other node's output index
       entry.entry = input.list[1].num;
       // set other nodes output as connected to this node
-      entry.node->outputs.push_back({n, (int)j});
+      entry.node->outputs.push_back({n, static_cast<int>(j)});
     }
     nodeMap[i] = n;
   }

From a2647af60c90c50ffabdeb0a8053338418b039bf Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-6-220.us-west-2.compute.internal>
Date: Wed, 2 Sep 2020 03:59:04 +0000
Subject: [PATCH 5/8] changed to use pointers for stringstream since not
 copyable

---
 include/mxnet/lib_api.h |  4 ++--
 src/lib_api.cc          | 13 +++++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index 57267d420a33..1ac45ba81b49 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -236,9 +236,9 @@ class MXerrorMsgs {
   /*! \brief constructor */
   MXerrorMsgs() {}
   /*! \brief destructor */
-  ~MXerrorMsgs() {}
+  ~MXerrorMsgs();
   /*! \brief map of entries in registry */
-  std::vector<std::stringstream> messages;
+  std::vector<std::stringstream*> messages;
 };
 
 // Add a new error message, example: MX_ERROR_MSG << "my error msg";
diff --git a/src/lib_api.cc b/src/lib_api.cc
index 8be4423d4000..abb8e9beb279 100644
--- a/src/lib_api.cc
+++ b/src/lib_api.cc
@@ -31,15 +31,20 @@
 
 #include <mxnet/lib_api.h>
 
+mxnet::ext::MXerrorMsgs::~MXerrorMsgs() {
+  for (auto &ss : messages)
+    delete ss;
+}
+
 mxnet::ext::MXerrorMsgs* mxnet::ext::MXerrorMsgs::get() {
     static MXerrorMsgs inst;
     return &inst;
   }
 
 std::stringstream& mxnet::ext::MXerrorMsgs::add(const char* file, int line) {
-  messages.resize(messages.size()+1);
-  messages.back() << file << "[" << line << "]: ";
-  return messages.back();
+  messages.push_back(new std::stringstream());
+  *messages.back() << file << "[" << line << "]: ";
+  return *messages.back();
 }
 
 int mxnet::ext::MXerrorMsgs::size() {
@@ -47,7 +52,7 @@ int mxnet::ext::MXerrorMsgs::size() {
 }
 
 const std::string* mxnet::ext::MXerrorMsgs::get(int idx) {
-  return new std::string(messages.at(idx).str());
+  return new std::string(messages.at(idx)->str());
 }
 
 mxnet::ext::MXContext::MXContext() : dev_type("error"), dev_id(-1) {}

From 9f2657a6d993b2f40c3db0a1d50a37e451791674 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-6-220.us-west-2.compute.internal>
Date: Wed, 2 Sep 2020 05:11:31 +0000
Subject: [PATCH 6/8] fixed includes

---
 CMakeLists.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7704cb7ee9db..b8adddca6712 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -740,14 +740,14 @@ add_library(transposecsr_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensio
 add_library(transposerowsp_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposerowsp_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
 add_library(subgraph_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_subgraph/subgraph_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
 add_library(pass_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_pass/pass_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
-target_include_directories(customop_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
-target_include_directories(transposecsr_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
-target_include_directories(transposerowsp_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
-target_include_directories(subgraph_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
-target_include_directories(pass_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
+target_include_directories(customop_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_include_directories(transposecsr_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_include_directories(transposerowsp_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_include_directories(subgraph_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_include_directories(pass_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 if(USE_CUDA)
   add_library(customop_gpu_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
-  target_include_directories(customop_gpu_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
+  target_include_directories(customop_gpu_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 endif()
 if(UNIX)
   if (USE_CUDA)

From 8f2280427ca2f46e4c4b312f77e6a34a1254610b Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-6-220.us-west-2.compute.internal>
Date: Wed, 2 Sep 2020 05:53:59 +0000
Subject: [PATCH 7/8] fixed makefile includes

---
 Makefile | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index 8e2bc8b766c5..6a0a14484ad3 100644
--- a/Makefile
+++ b/Makefile
@@ -677,28 +677,28 @@ extension_libs: $(EXT_LIBS)
 
 build/libcustomop_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/gemm_lib.cc src/lib_api.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/gemm_lib.cc src/lib_api.cc -o $@ -I include
 build/libcustomop_gpu_lib.so:
 	@mkdir -p $(@D)
-	$(NVCC) -shared -std=c++11 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu src/lib_api.cc -o $@ -I include/mxnet
+	$(NVCC) -shared -std=c++11 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu src/lib_api.cc -o $@ -I include
 build/libsubgraph_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_subgraph/subgraph_lib.cc src/lib_api.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_subgraph/subgraph_lib.cc src/lib_api.cc -o $@ -I include
 build/libtransposecsr_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/transposecsr_lib.cc src/lib_api.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/transposecsr_lib.cc src/lib_api.cc -o $@ -I include
 build/libtransposerowsp_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/transposerowsp_lib.cc src/lib_api.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/transposerowsp_lib.cc src/lib_api.cc -o $@ -I include
 build/libcustomop_gpu_lib.so:
 	@mkdir -p $(@D)
-	$(NVCC) -shared -std=c++11 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu src/lib_api.cc -o $@ -I include/mxnet
+	$(NVCC) -shared -std=c++11 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu src/lib_api.cc -o $@ -I include
 build/libsubgraph_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_subgraph/subgraph_lib.cc src/lib_api.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_subgraph/subgraph_lib.cc src/lib_api.cc -o $@ -I include
 build/libpass_lib.so:
 	@mkdir -p $(@D)
-	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_pass/pass_lib.cc src/lib_api.cc -o $@ -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/extensions/lib_pass/pass_lib.cc src/lib_api.cc -o $@ -I include
 
 # Cython build
 cython:

From 65ce9d728e2a8ea50aeda4a68774688e25314981 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-6-220.us-west-2.compute.internal>
Date: Wed, 2 Sep 2020 21:23:33 +0000
Subject: [PATCH 8/8] skipped lint for malloc/free for passing across C ABI

---
 src/lib_api.cc | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/lib_api.cc b/src/lib_api.cc
index abb8e9beb279..8255095255f5 100644
--- a/src/lib_api.cc
+++ b/src/lib_api.cc
@@ -954,7 +954,7 @@ MX_VOID_RET _opRegGet(int idx, const char** name, int *isSGop,
 
 /*! \brief calls free from the external library for library allocated arrays */
 MX_VOID_RET _opCallFree(void* ptr) {
-  free(ptr);
+  free(ptr);  // NOLINT
 }
 
 /*! \brief returns status of calling parse attributes function for operator from library */
@@ -996,26 +996,27 @@ MX_INT_RET _opCallInferShape(mxnet::ext::inferShape_t inferShape, const char* co
   if (!retval) return retval;
 
   // allocate space for modified input dims, shape
-  *mod_indims = static_cast<int*>(malloc (num_in * sizeof(int)));
-  *mod_inshapes = static_cast<unsigned**>(malloc (num_in * sizeof(unsigned*)));
+  *mod_indims = static_cast<int*>(malloc (num_in * sizeof(int)));  // NOLINT
+  *mod_inshapes = static_cast<unsigned**>(malloc (num_in * sizeof(unsigned*)));  // NOLINT
 
   // copy modified input shapes
   for (int i = 0; i < num_in; i++) {
     (*mod_indims)[i] = in_shapes[i].size();
-    (*mod_inshapes)[i] = static_cast<unsigned*>(malloc ((*mod_indims)[i] * sizeof(unsigned)));
+    (*mod_inshapes)[i] = static_cast<unsigned*>(
+                           malloc ((*mod_indims)[i] * sizeof(unsigned)));  // NOLINT
     for (int j = 0; j < (*mod_indims)[i]; j++) {
       (*mod_inshapes)[i][j] = in_shapes[i][j];
     }
   }
 
   // allocate space for output dims, shape
-  *outdims = static_cast<int*>(malloc (num_out * sizeof(int)));
-  *outshapes = static_cast<unsigned**>(malloc (num_out * sizeof(unsigned*)));
+  *outdims = static_cast<int*>(malloc (num_out * sizeof(int)));  // NOLINT
+  *outshapes = static_cast<unsigned**>(malloc (num_out * sizeof(unsigned*)));  // NOLINT
 
   // copy output shapes
   for (int i = 0; i < num_out; i++) {
     (*outdims)[i] = out_shapes[i].size();
-    (*outshapes)[i] = static_cast<unsigned*>(malloc ((*outdims)[i] * sizeof(unsigned)));
+    (*outshapes)[i] = static_cast<unsigned*>(malloc ((*outdims)[i] * sizeof(unsigned)));  // NOLINT
     for (int j = 0; j < (*outdims)[i]; j++) {
       (*outshapes)[i][j] = out_shapes[i][j];
     }
@@ -1199,7 +1200,7 @@ MX_INT_RET _opCallMutateInputs(mxnet::ext::mutateInputs_t mutate, const char* co
 
   // output the input indices
   *indices_size = mut_ind.size();
-  *mutate_indices = static_cast<int*>(malloc (*indices_size * sizeof(int)));
+  *mutate_indices = static_cast<int*>(malloc (*indices_size * sizeof(int)));  // NOLINT
   for (int i = 0; i < *indices_size; i++) {
     (*mutate_indices)[i] = mut_ind[i];
   }
@@ -1417,7 +1418,7 @@ MX_VOID_RET _partCallFilter(void* sel_inst, int* candidates, int num_candidates,
   sel_ptr->Filter(candidates_, &keep_);
 
   *num_keep = keep_.size();
-  *keep = static_cast<int*>(malloc(keep_.size() * sizeof(int)));
+  *keep = static_cast<int*>(malloc(keep_.size() * sizeof(int)));  // NOLINT
   for (unsigned i=0; i < keep_.size(); i++)
     (*keep)[i] = keep_[i];
 }
@@ -1485,15 +1486,15 @@ MX_INT_RET _partCallReviewSubgraph(mxnet::ext::reviewSubgraph_t reviewSubgraph,
   if (subgraph->attrs.size() > 0) {
     *num_attrs = subgraph->attrs.size();
     // allocate space for attributes
-    *attr_keys = static_cast<char**>(malloc (*num_attrs * sizeof(char*)));
-    *attr_vals = static_cast<char**>(malloc (*num_attrs * sizeof(char*)));
+    *attr_keys = static_cast<char**>(malloc (*num_attrs * sizeof(char*)));  // NOLINT
+    *attr_vals = static_cast<char**>(malloc (*num_attrs * sizeof(char*)));  // NOLINT
 
     // copy attributes
     int i = 0;
     for (auto kv : subgraph->attrs) {
-      (*attr_keys)[i] = static_cast<char*>(malloc ((kv.first.size()+1) * sizeof(char)));
+      (*attr_keys)[i] = static_cast<char*>(malloc ((kv.first.size()+1) * sizeof(char)));  // NOLINT
       std::string val = kv.second.dump();  // convert JsonVal back to string
-      (*attr_vals)[i] = static_cast<char*>(malloc ((val.size()+1) * sizeof(char)));
+      (*attr_vals)[i] = static_cast<char*>(malloc ((val.size()+1) * sizeof(char)));  // NOLINT
       snprintf((*attr_keys)[i], kv.first.size()+1, "%s", kv.first.c_str());
       snprintf((*attr_vals)[i], val.size()+1, "%s", val.c_str());
       i++;