From f55609b4a19ed7166d9b4dbbee4acd48af3697ac Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Sun, 28 Jul 2019 18:41:10 -0700 Subject: [PATCH] [VTA] Refactor to increase platform coverage (Ultra96 etc.) (#3496) * hardware refactor for increased FPGA coverage, small optimizations * fix header * cleaning up parameters that won't be needed for now * streamlining makefile, and simplifying tcl scripts * moving parameter derivation into pkg_config.py, keeping tcl scripts lightweight * refactoring tcl script to avoid global variables * deriving AXI signals in pkg_config.py * unifying address map definition for hardware and software drivers * single channel design for ultra96 to simplify build * enable alu by default, no mul opcode for now * hardware fix * new bitstream; vta version * avoid error when env variable is not set * ultra96 cleanup * further cleaning up tcl script for bitstream generation * preliminary rpc server support on ultra96 * rpc server tracker scripts * ultra96 ldflag * ultra96 support * ultra96 support * cleanup line * cmake support for ultra96 * simplify memory instantiation * cleaning up IP parameter initialization * fix queue instantiation * 2019.1 transition * fix macro def * removing bus width from config * cleanup * fix * turning off testing for now * cleanup ultra96 ps instantiation * minor refactor * adding comments * upgrading to tophub v0.6 * model used in TVM target now refers to a specific version of VTA for better autoTVM scheduling * revert change due to bug * rename driver files to be for zynq-type devices * streamlining address mapping * unifying register map offset values between driver and hardware generator * rely on cma library for cache flush/invalidation * coherence management * not make buffer packing depend on data types that can be wider than 64bits * refactor config derivation to minimize free parameters * fix environment/pkg config interaction * adding cfg dump property to pkgconfig * fix rpc reconfig * fix spacing * cleanup * fix spacing * long line fix * fix spacing and lint * fix line length * cmake fix * environment fix * renaming after pynq since the driver stack relies on the pynq library - see pynq.io * update doc * adding parameterization to name * space * removing reg width * vta RPC * update doc on how to edit vta_config.json * fix path * fix path --- .../{pynq_rpc => vta_rpc}/start_rpc_server.sh | 0 .../start_rpc_server_to_tracker.py} | 5 +- cmake/modules/VTA.cmake | 17 +- docs/vta/dev/config.rst | 23 +- docs/vta/install.md | 25 +- python/tvm/autotvm/tophub.py | 2 +- vta/config/pynq_sample.json | 10 +- vta/config/ultra96_sample.json | 13 + vta/config/vta_config.json | 8 +- vta/config/vta_config.py | 185 +-- vta/hardware/xilinx/Makefile | 110 +- vta/hardware/xilinx/scripts/hls.tcl | 293 ++-- vta/hardware/xilinx/scripts/vivado.tcl | 1193 +++++------------ vta/hardware/xilinx/sim/vta_test.cc | 17 +- vta/hardware/xilinx/src/vta.cc | 851 ++++++------ vta/hardware/xilinx/src/vta.h | 65 +- vta/include/vta/driver.h | 16 +- vta/include/vta/hw_spec.h | 247 +--- vta/python/vta/bitstream.py | 5 +- vta/python/vta/environment.py | 53 +- vta/python/vta/pkg_config.py | 194 ++- vta/python/vta/program_bitstream.py | 7 +- vta/python/vta/rpc_client.py | 2 +- vta/python/vta/testing/util.py | 17 +- vta/src/pynq/pynq_driver.cc | 68 +- vta/src/pynq/pynq_driver.h | 46 +- vta/src/runtime.cc | 32 +- vta/src/sim/sim_driver.cc | 4 +- vta/src/tsim/tsim_driver.cc | 4 +- vta/tests/hardware/common/test_lib.cc | 405 +++--- vta/tests/hardware/common/test_lib.h | 16 +-
vta/tests/python/unittest/test_environment.py | 2 +- .../frontend/deploy_resnet_on_vta.py | 4 +- 33 files changed, 1610 insertions(+), 2329 deletions(-) rename apps/{pynq_rpc => vta_rpc}/start_rpc_server.sh (100%) rename apps/{pynq_rpc/start_rpc_server_to_tracker.sh => vta_rpc/start_rpc_server_to_tracker.py} (83%) create mode 100644 vta/config/ultra96_sample.json diff --git a/apps/pynq_rpc/start_rpc_server.sh b/apps/vta_rpc/start_rpc_server.sh similarity index 100% rename from apps/pynq_rpc/start_rpc_server.sh rename to apps/vta_rpc/start_rpc_server.sh diff --git a/apps/pynq_rpc/start_rpc_server_to_tracker.sh b/apps/vta_rpc/start_rpc_server_to_tracker.py similarity index 83% rename from apps/pynq_rpc/start_rpc_server_to_tracker.sh rename to apps/vta_rpc/start_rpc_server_to_tracker.py index f1b906327add..fd2998efe095 100755 --- a/apps/pynq_rpc/start_rpc_server_to_tracker.sh +++ b/apps/vta_rpc/start_rpc_server_to_tracker.py @@ -17,7 +17,10 @@ # under the License. PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )" +# Derive target specified by vta_config.json +VTA_CONFIG=${PROJROOT}/vta/config/vta_config.py +TARGET=$(python ${VTA_CONFIG} --target) export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq -python3 -m vta.exec.rpc_server --tracker fleet:9190 --key pynq +python3 -m vta.exec.rpc_server --tracker fleet:9190 --key $TARGET diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 6d5ea000edc2..bae8d458d298 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -38,11 +38,16 @@ elseif(PYTHON) string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_=.]*" VTA_DEFINITIONS "${__vta_defs}") file(GLOB VTA_RUNTIME_SRCS vta/src/*.cc) - file(GLOB __vta_target_srcs vta/src/${VTA_TARGET}/*.cc) + # Add sim driver sources + if(${VTA_TARGET} STREQUAL "sim") + file(GLOB __vta_target_srcs vta/src/sim/*.cc) + endif() + # Add pynq driver sources + if(${VTA_TARGET} STREQUAL "pynq" OR ${VTA_TARGET} STREQUAL "ultra96") + file(GLOB __vta_target_srcs vta/src/pynq/*.cc) + endif() list(APPEND VTA_RUNTIME_SRCS ${__vta_target_srcs}) - - add_library(vta SHARED ${VTA_RUNTIME_SRCS}) - + # Add tsim driver sources if(${VTA_TARGET} STREQUAL "tsim") target_compile_definitions(vta PUBLIC USE_TSIM) include_directories("vta/include") @@ -50,6 +55,8 @@ elseif(PYTHON) list(APPEND RUNTIME_SRCS ${RUNTIME_DPI_SRCS}) endif() + add_library(vta SHARED ${VTA_RUNTIME_SRCS}) + target_include_directories(vta PUBLIC vta/include) foreach(__def ${VTA_DEFINITIONS}) @@ -62,7 +69,7 @@ elseif(PYTHON) endif(APPLE) # PYNQ rules for Pynq v2.4 - if(${VTA_TARGET} STREQUAL "pynq") + if(${VTA_TARGET} STREQUAL "pynq" OR ${VTA_TARGET} STREQUAL "ultra96") find_library(__cma_lib NAMES cma PATH /usr/lib) target_link_libraries(vta ${__cma_lib}) endif() diff --git a/docs/vta/dev/config.rst b/docs/vta/dev/config.rst index 0ca6b99759c0..f4b5bcec8af1 100644 --- a/docs/vta/dev/config.rst +++ b/docs/vta/dev/config.rst @@ -36,10 +36,6 @@ below. +=======================+============+========================================================+ | ``TARGET`` | String | The TVM device target. | +-----------------------+------------+--------------------------------------------------------+ -| ``HW_FREQ`` | Int | FPGA frequency in MHz. | -+-----------------------+------------+--------------------------------------------------------+ -| ``HW_CLK_TARGET`` | Int | FPGA clock period in ns target for HLS tool.
| +-----------------------+------------+--------------------------------------------------------+ | ``HW_VER`` | String | VTA hardware version number. | +-----------------------+------------+--------------------------------------------------------+ | ``LOG_INP_WIDTH`` | Int (log2) | Input data type signed integer width. | @@ -48,13 +44,9 @@ below. +-----------------------+------------+--------------------------------------------------------+ | ``LOG_ACC_WIDTH`` | Int (log2) | Accumulator data type signed integer width. | +-----------------------+------------+--------------------------------------------------------+ -| ``LOG_OUT_WIDTH`` | Int (log2) | Output data type signed integer width. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_BATCH`` | Int (log2) | VTA matrix multiply intrinsic output dimension 0. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_BLOCK_IN`` | Int (log2) | VTA matrix multiply reduction dimension. | +| ``LOG_BATCH`` | Int (log2) | VTA matrix multiply intrinsic input/output dimension 0.| +-----------------------+------------+--------------------------------------------------------+ -| ``LOG_BLOCK_OUT`` | Int (log2) | VTA matrix multiply intrinsic output dimension 1. | +| ``LOG_BLOCK`` | Int (log2) | VTA matrix multiply inner dimensions. | +-----------------------+------------+--------------------------------------------------------+ | ``LOG_UOP_BUFF_SIZE`` | Int (log2) | Micro-op on-chip buffer in Bytes. | +-----------------------+------------+--------------------------------------------------------+ @@ -75,13 +67,8 @@ below. We provide additional detail below regarding each parameter: - - ``TARGET``: Can be set to ``"pynq"`` or ``"sim"``. - - ``HW_FREQ``: In pynq mode, can be set to ``100``, ``142``, ``167``, or ``200`` MHz. - - ``HW_CLK_TARGET``: The lower the target, the more pipeline stages HLS will insert to achieve timing closure during place and route (this can also slightly decrease performance). + - ``TARGET``: Can be set to ``"pynq"``, ``"ultra96"``, ``"sim"`` (fast simulator), or ``"tsim"`` (cycle-accurate simulator with Verilator). - ``HW_VER``: Hardware version which increments every time the VTA hardware design changes. This parameter is used to uniquely identify hardware bitstreams. - - ``LOG_OUT_WIDTH``: We recommend matching ``LOG_OUT_WIDTH`` to ``LOG_INP_WIDTH``. - - ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension. - - ``LOG_BLOCK_IN``: Equivalent to B in multiplication of shape (A, B) x (B, C), or typically, the input channel dimension. - - ``LOG_BLOCK_OUT``: Equivalent to C in multiplication of shape (A, B) x (B, C), or typically, the output channel dimension. + - ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension of the inner tensor computation. + - ``LOG_BLOCK``: Equivalent to B and C in multiplication of shape (A, B) x (B, C), or typically, the input/output channel dimensions of the inner tensor computation.
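To make the shape mapping above concrete, here is a minimal sketch of how the log2 parameters in `vta_config.json` translate into the (A, B) x (B, C) GEMM tile shapes of the VTA tensor intrinsic. The helper below is illustrative only and not part of this patch; the values mirror the default configuration.

```python
# Illustrative sketch -- not part of this patch. Derives the GEMM tile shapes
# of the VTA tensor intrinsic from the log2 parameters in vta_config.json.
def gemm_tile_shapes(log_batch, log_block):
    batch = 1 << log_batch   # A: batch dimension of the intrinsic
    block = 1 << log_block   # B and C: input/output channel dimensions
    inp = (batch, block)     # input tile of shape (A, B)
    wgt = (block, block)     # weight tile, square since B == C after this patch
    out = (batch, block)     # output tile of shape (A, C)
    return inp, wgt, out

# Default config (LOG_BATCH = 0, LOG_BLOCK = 4):
print(gemm_tile_shapes(0, 4))  # ((1, 16), (16, 16), (1, 16))
```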
diff --git a/docs/vta/install.md b/docs/vta/install.md index 6c87b4edd288..2583e331ecd5 100644 --- a/docs/vta/install.md +++ b/docs/vta/install.md @@ -61,7 +61,7 @@ To do so, ```bash cd <tvm root> -cp vta/config/vta_config.json vta_config.json +vim vta/config/vta_config.json # edit vta_config.json make vta ``` @@ -118,7 +118,7 @@ cd /home/xilinx/tvm mkdir build cp cmake/config.cmake build/. # Copy pynq specific configuration -cp vta/config/pynq_sample.json build/vta_config.json +cp vta/config/pynq_sample.json vta/config/vta_config.json cd build cmake .. make runtime vta -j2 @@ -147,13 +147,12 @@ export VTA_PYNQ_RPC_PORT=9091 ``` In addition, you'll need to edit the `vta_config.json` file on the host to indicate that we are targeting the Pynq platform, by setting the `TARGET` field to `"pynq"`. -Alternatively, you can copy the default `vta/config/pynq_sample.json` into the TVM root as `vta_config.json`. > Note: in contrast to our simulation setup, there are no libraries to compile on the host side since the host offloads all of the computation to the Pynq board. ```bash # On the Host-side cd <tvm root> -cp vta/config/pynq_sample.json vta_config.json +cp vta/config/pynq_sample.json vta/config/vta_config.json ``` Once again, we will run the 2D convolution testbench. @@ -187,28 +186,28 @@ This third and last guide allows users to generate custom VTA bitstreams using f ### Xilinx Toolchain Installation -We recommend using `Vivado 2018.2` since our scripts have been tested to work on this version of the Xilinx toolchains. +We recommend using `Vivado 2019.1` since our scripts have been tested to work on this version of the Xilinx toolchains. Our guide is written for Linux (Ubuntu) installation. -You’ll need to install Xilinx’s FPGA compilation toolchain, [Vivado HL WebPACK 2018.2](https://www.xilinx.com/products/design-tools/vivado.html), which is a license-free version of the Vivado HLx toolchain. +You’ll need to install Xilinx’s FPGA compilation toolchain, [Vivado HL WebPACK 2019.1](https://www.xilinx.com/products/design-tools/vivado.html), which is a license-free version of the Vivado HLx toolchain. #### Obtaining and Launching the Vivado GUI Installer 1. Go to the [download webpage](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/vivado-design-tools/2019-1.html), and download the Linux Self Extracting Web Installer for Vivado HLx 2019.1: WebPACK and Editions. 2. You’ll have to sign in with a Xilinx account; creating one takes about two minutes. 3. Complete the Name and Address Verification by clicking “Next”, and you will get the opportunity to download a binary file, called `Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin`. 4. Now that the file is downloaded, go to your `Downloads` directory, and change the file permissions so it can be executed: ```bash chmod u+x Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin ``` 5.
Now you can execute the binary: ```bash -./Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin +./Xilinx_Vivado_SDK_Web_2019.1_0524_1430_Lin64.bin ``` #### Xilinx Vivado GUI Installer Steps -At this point you've launched the Vivado 2018.2 Installer GUI program. +At this point you've launched the Vivado 2019.1 Installer GUI program. 1. Click “Next” on the *Welcome* screen. 2. On the *Select Install Type* screen, enter your Xilinx user credentials under the “User Authentication” box and select the “Download and Install Now” option before clicking “Next”. @@ -230,8 +229,8 @@ At this point you've launched the Vivado 2018.2 Installer GUI program. The last step is to update your `~/.bashrc` with the following lines. This will include all of the Xilinx binary paths so you can launch compilation scripts from the command line. ```bash -# Xilinx Vivado 2018.2 environment -export XILINX_VIVADO=${XILINX_PATH}/Vivado/2018.2 +# Xilinx Vivado 2019.1 environment +export XILINX_VIVADO=${XILINX_PATH}/Vivado/2019.1 export PATH=${XILINX_VIVADO}/bin:${PATH} ``` diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index 37a95d6f774d..0130384c2e69 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -44,7 +44,7 @@ 'opencl': "v0.02", 'mali': "v0.05", - 'vta': "v0.05", + 'vta': "v0.06", } logger = logging.getLogger('autotvm') diff --git a/vta/config/pynq_sample.json b/vta/config/pynq_sample.json index 5c37108e6b12..380984a28972 100644 --- a/vta/config/pynq_sample.json +++ b/vta/config/pynq_sample.json @@ -1,17 +1,13 @@ { "TARGET" : "pynq", - "HW_FREQ" : 100, - "HW_CLK_TARGET" : 8, - "HW_VER" : "0.0.0", + "HW_VER" : "0.0.1", "LOG_INP_WIDTH" : 3, "LOG_WGT_WIDTH" : 3, "LOG_ACC_WIDTH" : 5, - "LOG_OUT_WIDTH" : 3, "LOG_BATCH" : 0, - "LOG_BLOCK_IN" : 4, - "LOG_BLOCK_OUT" : 4, + "LOG_BLOCK" : 4, "LOG_UOP_BUFF_SIZE" : 15, "LOG_INP_BUFF_SIZE" : 15, "LOG_WGT_BUFF_SIZE" : 18, "LOG_ACC_BUFF_SIZE" : 17 } diff --git a/vta/config/ultra96_sample.json b/vta/config/ultra96_sample.json new file mode 100644 index 000000000000..013420cff52e --- /dev/null +++ b/vta/config/ultra96_sample.json @@ -0,0 +1,13 @@ +{ + "TARGET" : "ultra96", + "HW_VER" : "0.0.1", + "LOG_INP_WIDTH" : 3, + "LOG_WGT_WIDTH" : 3, + "LOG_ACC_WIDTH" : 5, + "LOG_BATCH" : 0, + "LOG_BLOCK" : 4, + "LOG_UOP_BUFF_SIZE" : 15, + "LOG_INP_BUFF_SIZE" : 15, + "LOG_WGT_BUFF_SIZE" : 18, + "LOG_ACC_BUFF_SIZE" : 17 +} diff --git a/vta/config/vta_config.json b/vta/config/vta_config.json index 602af0126816..0591bb486143 100644 --- a/vta/config/vta_config.json +++ b/vta/config/vta_config.json @@ -1,15 +1,11 @@ { "TARGET" : "sim", - "HW_FREQ" : 100, - "HW_CLK_TARGET" : 7, - "HW_VER" : "0.0.0", + "HW_VER" : "0.0.1", "LOG_INP_WIDTH" : 3, "LOG_WGT_WIDTH" : 3, "LOG_ACC_WIDTH" : 5, - "LOG_OUT_WIDTH" : 3, "LOG_BATCH" : 0, - "LOG_BLOCK_IN" : 4, - "LOG_BLOCK_OUT" : 4, + "LOG_BLOCK" : 4, "LOG_UOP_BUFF_SIZE" : 15, "LOG_INP_BUFF_SIZE" : 15, "LOG_WGT_BUFF_SIZE" : 18, diff --git a/vta/config/vta_config.py b/vta/config/vta_config.py index ea07e5a7770c..b925bf5fe4df 100644 --- a/vta/config/vta_config.py +++ b/vta/config/vta_config.py @@ -30,7 +30,6 @@ def get_pkg_config(cfg): PkgConfig = libpkg["PkgConfig"] return PkgConfig(cfg, proj_root) - def main(): """Main function""" parser = argparse.ArgumentParser() @@ -45,7 +44,7 @@ def main(): parser.add_argument("--update", action="store_true", help="Print out the json option.") parser.add_argument("--ldflags", action="store_true", - help="print the cflags") + help="print the
ldflags") parser.add_argument("--cfg-json", action="store_true", help="print all the config json") parser.add_argument("--save-cfg-json", type=str, default="", @@ -54,33 +53,51 @@ def main(): help="print the target") parser.add_argument("--cfg-str", action="store_true", help="print the configuration string") - parser.add_argument("--get-inpwidth", action="store_true", - help="returns log of input bitwidth") - parser.add_argument("--get-wgtwidth", action="store_true", - help="returns log of weight bitwidth") - parser.add_argument("--get-accwidth", action="store_true", - help="returns log of accum bitwidth") - parser.add_argument("--get-outwidth", action="store_true", - help="returns log of output bitwidth") - parser.add_argument("--get-batch", action="store_true", - help="returns log of tensor batch dimension") - parser.add_argument("--get-blockin", action="store_true", - help="returns log of tensor block in dimension") - parser.add_argument("--get-blockout", action="store_true", - help="returns log of tensor block out dimension") - parser.add_argument("--get-uopbuffsize", action="store_true", - help="returns log of micro-op buffer size in B") - parser.add_argument("--get-inpbuffsize", action="store_true", - help="returns log of input buffer size in B") - parser.add_argument("--get-wgtbuffsize", action="store_true", - help="returns log of weight buffer size in B") - parser.add_argument("--get-accbuffsize", action="store_true", - help="returns log of accum buffer size in B") - parser.add_argument("--get-outbuffsize", action="store_true", - help="returns log of output buffer size in B") - parser.add_argument("--get-fpgafreq", action="store_true", + parser.add_argument("--get-inp-mem-banks", action="store_true", + help="returns number of input memory banks") + parser.add_argument("--get-inp-mem-width", action="store_true", + help="returns input memory read/write port width") + parser.add_argument("--get-inp-mem-depth", action="store_true", + help="returns input memory depth") + parser.add_argument("--get-inp-mem-axi-ratio", action="store_true", + help="returns ratio between input element width and axi width") + parser.add_argument("--get-wgt-mem-banks", action="store_true", + help="returns number of weight memory banks") + parser.add_argument("--get-wgt-mem-width", action="store_true", + help="returns weight memory read/write port width") + parser.add_argument("--get-wgt-mem-depth", action="store_true", + help="returns weight memory depth") + parser.add_argument("--get-wgt-mem-axi-ratio", action="store_true", + help="returns ratio between weight element width and axi width") + parser.add_argument("--get-out-mem-banks", action="store_true", + help="returns number of output memory banks") + parser.add_argument("--get-out-mem-width", action="store_true", + help="returns output memory read/write port width") + parser.add_argument("--get-out-mem-depth", action="store_true", + help="returns output memory depth") + parser.add_argument("--get-out-mem-axi-ratio", action="store_true", + help="returns ratio between output element width and axi width") + parser.add_argument("--get-axi-cache-bits", action="store_true", + help="returns AXI system ARCACHE/AWCACHE hardcoded bit value") + parser.add_argument("--get-axi-prot-bits", action="store_true", + help="returns AXI system ARPROT/AWPROT hardcoded bit value") + parser.add_argument("--get-ip-reg-map-range", action="store_true", + help="returns ip register map address range") + parser.add_argument("--get-fetch-base-addr", action="store_true", + help="returns 
fetch module base address") + parser.add_argument("--get-load-base-addr", action="store_true", + help="returns load module base address") + parser.add_argument("--get-compute-base-addr", action="store_true", + help="returns compute module base address") + parser.add_argument("--get-store-base-addr", action="store_true", + help="returns store module base address") + parser.add_argument("--get-fpga-dev", action="store_true", + help="returns FPGA device target") + parser.add_argument("--get-fpga-family", action="store_true", + help="returns FPGA device family") + parser.add_argument("--get-fpga-freq", action="store_true", help="returns FPGA frequency") - parser.add_argument("--get-fpgaper", action="store_true", + parser.add_argument("--get-fpga-per", action="store_true", help="returns HLS target clock period") args = parser.parse_args() @@ -92,8 +109,6 @@ def main(): os.path.abspath(os.path.expanduser(__file__))) proj_root = os.path.abspath(os.path.join(curr_path, "../../")) path_list = [ - os.path.join(proj_root, "vta_config.json"), - os.path.join(proj_root, "build", "vta_config.json"), os.path.join(proj_root, "vta/config/vta_config.json") ] if args.use_cfg: @@ -102,14 +117,11 @@ def main(): if not ok_path_list: raise RuntimeError("Cannot find config in %s" % str(path_list)) cfg = json.load(open(ok_path_list[0])) - cfg["LOG_OUT_BUFF_SIZE"] = ( - cfg["LOG_ACC_BUFF_SIZE"] + - cfg["LOG_OUT_WIDTH"] - - cfg["LOG_ACC_WIDTH"]) + pkg = get_pkg_config(cfg) if args.target: - print(pkg.target) + print(pkg.TARGET) if args.defs: print(" ".join(pkg.macro_defs)) @@ -119,8 +131,10 @@ def main(): if args.cflags: cflags_str = " ".join(pkg.cflags) - if cfg["TARGET"] == "pynq": + if pkg.TARGET == "pynq": cflags_str += " -DVTA_TARGET_PYNQ" + if pkg.TARGET == "ultra96": + cflags_str += " -DVTA_TARGET_ULTRA96" print(cflags_str) if args.ldflags: @@ -134,63 +148,76 @@ def main(): fo.write(pkg.cfg_json) if args.cfg_str: - # Needs to match the BITSTREAM string in python/vta/environment.py - cfg_str = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}".format( - (1 << cfg["LOG_BATCH"]), - (1 << cfg["LOG_BLOCK_IN"]), - (1 << cfg["LOG_BLOCK_OUT"]), - (1 << cfg["LOG_INP_WIDTH"]), - (1 << cfg["LOG_WGT_WIDTH"]), - cfg["LOG_UOP_BUFF_SIZE"], - cfg["LOG_INP_BUFF_SIZE"], - cfg["LOG_WGT_BUFF_SIZE"], - cfg["LOG_ACC_BUFF_SIZE"], - cfg["HW_FREQ"], - cfg["HW_CLK_TARGET"], - cfg["HW_VER"].replace('.', '_')) - print(cfg_str) + print(pkg.TARGET + "_" + pkg.bitstream) + + if args.get_inp_mem_banks: + print(pkg.inp_mem_banks) + + if args.get_inp_mem_width: + print(pkg.inp_mem_width) + + if args.get_inp_mem_depth: + print(pkg.inp_mem_depth) + + if args.get_inp_mem_axi_ratio: + print(pkg.inp_mem_axi_ratio) + + if args.get_wgt_mem_banks: + print(pkg.wgt_mem_banks) + + if args.get_wgt_mem_width: + print(pkg.wgt_mem_width) + + if args.get_wgt_mem_depth: + print(pkg.wgt_mem_depth) + + if args.get_wgt_mem_axi_ratio: + print(pkg.wgt_mem_axi_ratio) + + if args.get_out_mem_banks: + print(pkg.out_mem_banks) - if args.get_inpwidth: - print(cfg["LOG_INP_WIDTH"]) + if args.get_out_mem_width: + print(pkg.out_mem_width) - if args.get_wgtwidth: - print(cfg["LOG_WGT_WIDTH"]) + if args.get_out_mem_depth: + print(pkg.out_mem_depth) - if args.get_accwidth: - print(cfg["LOG_ACC_WIDTH"]) + if args.get_out_mem_axi_ratio: + print(pkg.out_mem_axi_ratio) - if args.get_outwidth: - print(cfg["LOG_OUT_WIDTH"]) + if args.get_axi_cache_bits: + print(pkg.axi_cache_bits) - if args.get_batch: - print(cfg["LOG_BATCH"]) + if args.get_axi_prot_bits: + print(pkg.axi_prot_bits) - if 
args.get_blockin: - print(cfg["LOG_BLOCK_IN"]) + if args.get_ip_reg_map_range: + print(pkg.ip_reg_map_range) - if args.get_blockout: - print(cfg["LOG_BLOCK_OUT"]) + if args.get_fetch_base_addr: + print(pkg.fetch_base_addr) - if args.get_uopbuffsize: - print(cfg["LOG_UOP_BUFF_SIZE"]) + if args.get_load_base_addr: + print(pkg.load_base_addr) - if args.get_inpbuffsize: - print(cfg["LOG_INP_BUFF_SIZE"]) + if args.get_compute_base_addr: + print(pkg.compute_base_addr) - if args.get_wgtbuffsize: - print(cfg["LOG_WGT_BUFF_SIZE"]) + if args.get_store_base_addr: + print(pkg.store_base_addr) - if args.get_outbuffsize: - print(cfg["LOG_OUT_BUFF_SIZE"]) + if args.get_fpga_dev: + print(pkg.fpga_device) - if args.get_accbuffsize: - print(cfg["LOG_ACC_BUFF_SIZE"]) + if args.get_fpga_family: + print(pkg.fpga_family) - if args.get_fpgafreq: - print(cfg["HW_FREQ"]) + if args.get_fpga_freq: + print(pkg.fpga_freq) - if args.get_fpgaper: - print(cfg["HW_CLK_TARGET"]) + if args.get_fpga_per: + print(pkg.fpga_per) if __name__ == "__main__": main() diff --git a/vta/hardware/xilinx/Makefile b/vta/hardware/xilinx/Makefile index af13cdc166f8..77d5d4413f6c 100644 --- a/vta/hardware/xilinx/Makefile +++ b/vta/hardware/xilinx/Makefile @@ -17,81 +17,30 @@ # Directories ROOTDIR = $(CURDIR) -BUILD_NAME = build -BUILD_DIR = $(ROOTDIR)/../../$(BUILD_NAME)/hardware/xilinx -SCRIPT_DIR = $(ROOTDIR)/scripts -SRC_DIR = $(ROOTDIR)/src -SIM_DIR = $(ROOTDIR)/sim -TEST_DIR = $(ROOTDIR)/../../tests/hardware/common -INCLUDE_DIR = $(ROOTDIR)/../../include +VTA_DIR = $(CURDIR)/../.. +BUILD_DIR = $(VTA_DIR)/build/hardware/xilinx +SCRIPT_DIR = $(CURDIR)/scripts +SRC_DIR = $(CURDIR)/src # Executables VIVADO_HLS = vivado_hls VIVADO = vivado -HSI = hsi - -# HLS mode -MODE = skip_sim -# Debug flag -DEBUG = false -# SLURM -SLURM = false -# Prevent generation of DSP -NO_DSP = false -# Prevent generation of ALU -NO_ALU = false # Process VTA JSON config -VTA_CONFIG = python $(CURDIR)/../../config/vta_config.py -CFLAGS := $(shell ${VTA_CONFIG} --cflags) -VTA_TARGET := $(shell ${VTA_CONFIG} --target) - -#--------------------- -# VTA Parameters -#-------------------- -VTA_INP_WIDTH := $(shell ${VTA_CONFIG} --get-inpwidth) -VTA_WGT_WIDTH := $(shell ${VTA_CONFIG} --get-wgtwidth) -VTA_ACC_WIDTH := $(shell ${VTA_CONFIG} --get-accwidth) -VTA_OUT_WIDTH := $(shell ${VTA_CONFIG} --get-outwidth) -VTA_BATCH := $(shell ${VTA_CONFIG} --get-batch) -VTA_IN_BLOCK := $(shell ${VTA_CONFIG} --get-blockin) -VTA_OUT_BLOCK := $(shell ${VTA_CONFIG} --get-blockout) -VTA_UOP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-uopbuffsize) -VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize) -VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize) -VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize) -VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize) - -#--------------------- -# FPGA Parameters -#-------------------- -VTA_CLOCK_FREQ = $(shell ${VTA_CONFIG} --get-fpgafreq) -VTA_TARGET_PER = $(shell ${VTA_CONFIG} --get-fpgaper) - -#--------------------- -# Compilation parameters -#-------------------- - -# Number of threads during compilation -VTA_HW_COMP_THREADS = 8 +VTA_CONFIG := $(CURDIR)/../../config/vta_config.py # Derive config name -CONF = $(shell ${VTA_CONFIG} --cfg-str) -IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF) -HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF) - -ifeq ($(SLURM), true) - IP_BUILD_PATH = /scratch/hls/$(CONF) - HW_BUILD_PATH = /scratch/vivado/$(CONF) -endif +CONF := $(shell python ${VTA_CONFIG} --cfg-str) +IP_BUILD_PATH := 
$(BUILD_DIR)/hls/$(CONF) +HW_BUILD_PATH := $(BUILD_DIR)/vivado/$(CONF) # IP file path -IP_PATH = $(BUILD_DIR)/hls/$(CONF)/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip +IP_PATH := $(BUILD_DIR)/hls/$(CONF)/vta_compute/soln/impl/ip/xilinx_com_hls_compute_1_0.zip # Bitstream file path -BIT_PATH = $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit +BIT_PATH := $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit -.PHONY: all ip bit bsp clean clean_all +.PHONY: all ip bit clean clean_all all: bit ip: $(IP_PATH) @@ -100,37 +49,24 @@ bit: $(BIT_PATH) $(IP_PATH): $(SRC_DIR)/* mkdir -p $(IP_BUILD_PATH) cd $(IP_BUILD_PATH) && \ - $(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \ - -tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \ - $(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(VTA_TARGET_PER) \ - $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \ - $(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \ - $(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \ - $(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE) -ifeq ($(SLURM), true) - mkdir -p $(BUILD_DIR)/hls - mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/. -endif + $(VIVADO_HLS) \ + -f $(SCRIPT_DIR)/hls.tcl \ + -tclargs \ + $(VTA_DIR) \ + ${VTA_CONFIG} $(BIT_PATH): $(IP_PATH) mkdir -p $(HW_BUILD_PATH) cd $(HW_BUILD_PATH) && \ - $(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \ - -tclargs $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) $(VTA_CLOCK_FREQ) \ - $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \ - $(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \ - $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE) -ifeq ($(SLURM), true) - mkdir -p $(BUILD_DIR)/vivado - mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/. -endif - -bsp: $(BIT_PATH) - cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog - cd $(HW_BUILD_PATH)/bsp && make + $(VIVADO) \ + -mode tcl \ + -source $(SCRIPT_DIR)/vivado.tcl \ + -tclargs \ + $(BUILD_DIR)/hls/$(CONF) \ + ${VTA_CONFIG} clean: - rm -rf *.out *.log *.sb figures + rm -rf *.out *.log cleanall: clean rm -rf $(BUILD_DIR) diff --git a/vta/hardware/xilinx/scripts/hls.tcl b/vta/hardware/xilinx/scripts/hls.tcl index 3d308bc58d25..f371d905113b 100644 --- a/vta/hardware/xilinx/scripts/hls.tcl +++ b/vta/hardware/xilinx/scripts/hls.tcl @@ -14,220 +14,125 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# -# Copyright (c) 2018 by Contributors -# file: hls.tcl -# brief: HLS generation script. 
-# # Command line arguments: -# Arg 1: path to design sources -# Arg 2: path to sim sources -# Arg 3: path to test sources -# Arg 4: path to include sources -# Arg 5: mode -# Arg 6: debug -# Arg 7: no_dsp -# Arg 8: no_alu -# Arg 9: target clock period -# Arg 10: input type width (log) -# Arg 11: weight type width (log) -# Arg 12: accum type width (log) -# Arg 13: output type width (log) -# Arg 14: batch size (log) -# Arg 15: in block size (log) -# Arg 16: out block size (log) -# Arg 17: uop buffer size in B (log) -# Arg 18: inp buffer size in B (log) -# Arg 19: wgt buffer size in B (log) -# Arg 20: acc buffer size in B (log) -# Arg 21: out buffer size in B (log) - -if { [llength $argv] eq 23 } { - set src_dir [lindex $argv 2] - set sim_dir [lindex $argv 3] - set test_dir [lindex $argv 4] - set include_dir [lindex $argv 5] - set mode [lindex $argv 6] - set debug [lindex $argv 7] - set no_dsp [lindex $argv 8] - set no_alu [lindex $argv 9] - set target_period [lindex $argv 10] - set inp_width [lindex $argv 11] - set wgt_width [lindex $argv 12] - set acc_width [lindex $argv 13] - set out_width [lindex $argv 14] - set batch [lindex $argv 15] - set block_in [lindex $argv 16] - set block_out [lindex $argv 17] - set uop_buff_size [lindex $argv 18] - set inp_buff_size [lindex $argv 19] - set wgt_buff_size [lindex $argv 20] - set acc_buff_size [lindex $argv 21] - set out_buff_size [lindex $argv 22] +# Arg 1: path to vta root +# Arg 2: path of config param script + +if { [llength $argv] eq 4 } { + set root_dir [lindex $argv 2] + set vta_config [lindex $argv 3] } else { - set src_dir "../src" - set sim_dir "../sim" - set test_dir "../../src/test" - set include_dir "../../include" - set mode "all" - set debug "false" - set no_dsp "true" - set no_alu "false" - set target_period 10 - set inp_width 3 - set wgt_width 3 - set acc_width 5 - set out_width 3 - set batch 1 - set block_in 4 - set block_out 4 - set uop_buff_size 15 - set inp_buff_size 15 - set wgt_buff_size 15 - set acc_buff_size 17 - set out_buff_size 15 - exit + puts "Not enough arguments provided!" + exit } +# Derive paths +set src_dir "$root_dir/hardware/xilinx/src" +set sim_dir "$root_dir/hardware/xilinx/sim" +set test_dir "$root_dir/tests/hardware/common" + +# C define flags that we want to pass to the compiler +set cflags [exec python $vta_config --cflags] + +# Get the VTA configuration parameters +set ::device [exec python $vta_config --get-fpga-dev] +set ::period [exec python $vta_config --get-fpga-per] + +# Get the VTA SRAM reshape/partition factors to get all memories +# to be of the same axi width. +set ::inp_reshape_factor [exec python $vta_config --get-inp-mem-axi-ratio] +set ::inp_partition_factor [exec python $vta_config --get-inp-mem-banks] +set ::wgt_reshape_factor [exec python $vta_config --get-wgt-mem-axi-ratio] +set ::wgt_partition_factor [exec python $vta_config --get-wgt-mem-banks] +set ::out_reshape_factor [exec python $vta_config --get-out-mem-axi-ratio] +set ::out_partition_factor [exec python $vta_config --get-out-mem-banks] + + # Initializes the HLS design and sets HLS pragmas for memory partitioning. # This is necessary because of a Vivado restriction that doesn't allow for # buses wider than 1024 bits.
-proc init_design {per inp_width wgt_width out_width batch block_in block_out} { - - # Set device number - set_part {xc7z020clg484-1} - - # Set the clock frequency - create_clock -period $per -name default - - # Set input partition factor to (INP_VECTOR_WIDTH*BATCH/1024) - set inp_partition_factor [expr {(1 << ($inp_width + $block_in + $batch)) / 1024}] - if {$inp_partition_factor == 0} { - set_directive_array_reshape -type complete -dim 2 "load" inp_mem - set_directive_array_reshape -type complete -dim 2 "compute" inp_mem - } else { - # Set input reshaping factor below to (1024/INP_VECTOR_WIDTH) - set inp_reshape_factor [expr {1024 / (1 << ($inp_width + $block_in))}] - set_directive_array_partition -type block -factor $inp_partition_factor -dim 2 "load" inp_mem - set_directive_array_partition -type block -factor $inp_partition_factor -dim 2 "compute" inp_mem - set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "load" inp_mem - set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "compute" inp_mem - } - # Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/1024) - set wgt_partition_factor [expr {(1 << ($wgt_width + $block_in + $block_out)) / 1024}] - if {$wgt_partition_factor == 0} { - set_directive_array_reshape -type complete -dim 2 "load" wgt_mem - set_directive_array_reshape -type complete -dim 2 "compute" wgt_mem - } else { - # Set weight reshaping factor below to (1024/WGT_VECTOR_WIDTH) - set wgt_reshape_factor [expr {1024 / (1 << ($wgt_width + $block_in))}] - set_directive_array_partition -type block -factor $wgt_partition_factor -dim 2 "load" wgt_mem - set_directive_array_partition -type block -factor $wgt_partition_factor -dim 2 "compute" wgt_mem - set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "load" wgt_mem - set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "compute" wgt_mem - } - # Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/1024) - set out_partition_factor [expr {(1 << ($out_width + $block_out + $batch)) / 1024}] - if {$out_partition_factor == 0} { - set_directive_array_reshape -type complete -dim 2 "compute" out_mem - set_directive_array_reshape -type complete -dim 2 "store" out_mem - } else { - # Set output reshaping factor below to (1024/OUT_VECTOR_WIDTH) - set out_reshape_factor [expr {1024 / (1 << ($out_width + $block_out))}] - set_directive_array_partition -type block -factor $out_partition_factor -dim 2 "compute" out_mem - set_directive_array_partition -type block -factor $out_partition_factor -dim 2 "store" out_mem - set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "compute" out_mem - set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "store" out_mem - } -} +proc init_design {} { -# C define flags to pass to compiler -set cflags "-I $include_dir -I $src_dir -I $test_dir \ - -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \ - -DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \ - -DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \ - -DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \ - -DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \ - -DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size" -if {$debug=="true"} { - append cflags " -DVTA_DEBUG=1" -} -if {$no_dsp=="true"} { - append cflags " -DNO_DSP" -} -if {$no_alu=="true"} { - append cflags " -DNO_ALU" + # Set device id + set_part $::device + + # Set the 
clock frequency + create_clock -period $::period -name default + + # HLS pragmas to reshape/partition the input memory read/write port + set_directive_array_reshape -type block -factor $::inp_reshape_factor -dim 2 "load" inp_mem + set_directive_array_reshape -type block -factor $::inp_reshape_factor -dim 2 "compute" inp_mem + if {$::inp_partition_factor > 1} { + set_directive_array_partition -type block -factor $::inp_partition_factor -dim 2 "load" inp_mem + set_directive_array_partition -type block -factor $::inp_partition_factor -dim 2 "compute" inp_mem + } + # HLS pragmas to reshape/partition the weight memory read/write port + set_directive_array_reshape -type block -factor $::wgt_reshape_factor -dim 2 "load" wgt_mem + set_directive_array_reshape -type block -factor $::wgt_reshape_factor -dim 2 "compute" wgt_mem + if {$::wgt_partition_factor >1} { + set_directive_array_partition -type block -factor $::wgt_partition_factor -dim 2 "load" wgt_mem + set_directive_array_partition -type block -factor $::wgt_partition_factor -dim 2 "compute" wgt_mem + } + # HLS pragmas to reshape/partition the output memory read/write port + set_directive_array_reshape -type block -factor $::out_reshape_factor -dim 2 "compute" out_mem + set_directive_array_reshape -type block -factor $::out_reshape_factor -dim 2 "store" out_mem + if {$::out_partition_factor > 1} { + set_directive_array_partition -type block -factor $::out_partition_factor -dim 2 "compute" out_mem + set_directive_array_partition -type block -factor $::out_partition_factor -dim 2 "store" out_mem + } } # HLS behavioral sim -if {$mode=="all" || $mode=="sim"} { - open_project vta_sim - set_top vta - add_files $src_dir/vta.cc -cflags $cflags - add_files -tb $sim_dir/vta_test.cc -cflags $cflags - add_files -tb $test_dir/test_lib.cc -cflags $cflags - open_solution "solution0" - init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out - csim_design -clean - close_project -} +open_project vta_sim +set_top vta +add_files $src_dir/vta.cc -cflags $cflags +add_files -tb $sim_dir/vta_test.cc -cflags $cflags +add_files -tb $test_dir/test_lib.cc -cflags $cflags +open_solution "soln" +init_design +csim_design -clean +close_project # Generate fetch stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} { - open_project vta_fetch - set_top fetch - add_files $src_dir/vta.cc -cflags $cflags - open_solution "solution0" - init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_fetch +set_top fetch +add_files $src_dir/vta.cc -cflags $cflags +open_solution "soln" +init_design +csynth_design +export_design -format ip_catalog +close_project # Generate load stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} { - open_project vta_load - set_top load - add_files $src_dir/vta.cc -cflags $cflags - open_solution "solution0" - init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_load +set_top load +add_files $src_dir/vta.cc -cflags $cflags +open_solution "soln" +init_design +csynth_design +export_design -format ip_catalog +close_project # Generate compute stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} { - open_project vta_compute - set_top compute - add_files $src_dir/vta.cc 
-cflags $cflags - open_solution "solution0" - init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_compute +set_top compute +add_files $src_dir/vta.cc -cflags $cflags +open_solution "soln" +init_design +csynth_design +export_design -format ip_catalog +close_project # Generate store stage -if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} { - open_project vta_store - set_top store - add_files $src_dir/vta.cc -cflags $cflags - open_solution "solution0" - init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out - csynth_design - if {$mode=="all" || $mode=="skip_sim"} { - export_design -format ip_catalog - } - close_project -} +open_project vta_store +set_top store +add_files $src_dir/vta.cc -cflags $cflags +open_solution "soln" +init_design +csynth_design +export_design -format ip_catalog +close_project exit diff --git a/vta/hardware/xilinx/scripts/vivado.tcl b/vta/hardware/xilinx/scripts/vivado.tcl index 9cfa10ea7482..3be575749c27 100644 --- a/vta/hardware/xilinx/scripts/vivado.tcl +++ b/vta/hardware/xilinx/scripts/vivado.tcl @@ -14,107 +14,67 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# -# Copyright (c) 2018 by Xilinx, Contributors -# file: vivado.tcl -# brief: Vivado compilation script. Partially automatically generated -# by Vivado. -# # Check if script is running in correct Vivado version. -set scripts_vivado_version 2018.2 +set scripts_vivado_version 2019.1 set current_vivado_version [version -short] if { [string first $scripts_vivado_version $current_vivado_version] == -1 } { puts "" catch {common::send_msg_id "BD_TCL-109" "ERROR" "This script was generated using Vivado \ - <$scripts_vivado_version> and is being run in <$current_vivado_version> of Vivado. \ - Please run the script in Vivado <$scripts_vivado_version> then open the design in Vivado \ - <$current_vivado_version>.
Upgrade the design by running \"Tools => Report => Report IP \ - Status...\", then run write_bd_tcl to create an updated script."} - + <$scripts_vivado_version> and is being run in <$current_vivado_version> of Vivado."} return 1 } # Parse argument list, derive the clock to utilize -set clock_id 0 -if { [llength $argv] eq 12 } { - set ip_path [lindex $argv 0] - set num_threads [lindex $argv 1] - set clock_freq [lindex $argv 2] - set inp_width [expr 1 << [lindex $argv 3]] - set wgt_width [expr 1 << [lindex $argv 4]] - set out_width [expr 1 << [lindex $argv 5]] - set batch [expr 1 << [lindex $argv 6]] - set out_block [expr 1 << [lindex $argv 7]] - set in_block [expr 1 << [lindex $argv 8]] - set inp_mem_size [expr 1 << [lindex $argv 9]] - set wgt_mem_size [expr 1 << [lindex $argv 10]] - set out_mem_size [expr 1 << [lindex $argv 11]] - if {$clock_freq eq 100} { - set clock_id 0 - puts "Setting clock frequency to 100MHz" - } elseif {$clock_freq eq 142} { - set clock_id 1 - puts "Setting clock frequency to 142MHz" - } elseif {$clock_freq eq 167} { - set clock_id 3 - puts "Setting clock frequency to 167MHz" - } elseif {$clock_freq eq 200} { - set clock_id 2 - puts "Setting clock frequency to 200MHz" - } else { - set clock_id 0 - puts "Unrecognized clock frequency, setting clock to 100MHz" - } +if { [llength $argv] eq 2 } { + set ip_path [lindex $argv 0] + set vta_config [lindex $argv 1] } else { - puts "Arg list incomplete: \ - " + puts "Arg list incomplete: <path to ip dir> <path to vta_config.py>" return 1 } -# Derive input mem parameters -set inp_mem_width [expr $inp_width * $batch * $in_block] -set inp_bus_width 1024 -set inp_part [expr $inp_mem_width / $inp_bus_width] -if {[expr $inp_part == 0]} { - set inp_part 1 - set inp_bus_width $inp_mem_width -} -set inp_mem_depth [expr $inp_mem_size * 8 / ($inp_mem_width * $inp_part)] - -# Derive weight mem parameters -set wgt_mem_width [expr $wgt_width * $out_block * $in_block] -set wgt_bus_width 1024 -set wgt_part [expr $wgt_mem_width / $wgt_bus_width] -if {[expr $wgt_part == 0]} { - set wgt_part 1 - set wgt_bus_width $wgt_mem_width -} -set wgt_mem_depth [expr $wgt_mem_size * 8 / ($wgt_mem_width * $wgt_part)] - -# Derive output mem parameters -set out_mem_width [expr $out_width * $batch * $out_block] -set out_bus_width 1024 -set out_part [expr $out_mem_width / $out_bus_width] -if {[expr $out_part == 0]} { - set out_part 1 - set out_bus_width $out_mem_width -} -set out_mem_depth [expr $out_mem_size * 8 / ($out_mem_width * $out_part)] - -# User defined paths +# Get the VTA configuration parameters +set target [exec python $vta_config --target] +set device_family [exec python $vta_config --get-fpga-family] +set clock_freq [exec python $vta_config --get-fpga-freq] + +# SRAM dimensions +set inp_part [exec python $vta_config --get-inp-mem-banks] +set inp_mem_width [exec python $vta_config --get-inp-mem-width] +set inp_mem_depth [exec python $vta_config --get-inp-mem-depth] +set wgt_part [exec python $vta_config --get-wgt-mem-banks] +set wgt_mem_width [exec python $vta_config --get-wgt-mem-width] +set wgt_mem_depth [exec python $vta_config --get-wgt-mem-depth] +set out_part [exec python $vta_config --get-out-mem-banks] +set out_mem_width [exec python $vta_config --get-out-mem-width] +set out_mem_depth [exec python $vta_config --get-out-mem-depth] + +# AXI bus signals +set axi_cache [exec python $vta_config --get-axi-cache-bits] +set axi_prot [exec python $vta_config --get-axi-prot-bits] + +# Address map +set ip_reg_map_range [exec python $vta_config --get-ip-reg-map-range] +set fetch_base_addr
[exec python $vta_config --get-fetch-base-addr] +set load_base_addr [exec python $vta_config --get-load-base-addr] +set compute_base_addr [exec python $vta_config --get-compute-base-addr] +set store_base_addr [exec python $vta_config --get-store-base-addr] + +# Paths to IP library of VTA modules set proj_name vta +set design_name $proj_name set proj_path "." set ip_lib "ip_lib" -set fetch_ip "${ip_path}/vta_fetch/solution0/impl/ip/xilinx_com_hls_fetch_1_0.zip" -set load_ip "${ip_path}/vta_load/solution0/impl/ip/xilinx_com_hls_load_1_0.zip" -set compute_ip "${ip_path}/vta_compute/solution0/impl/ip/xilinx_com_hls_compute_1_0.zip" -set store_ip "${ip_path}/vta_store/solution0/impl/ip/xilinx_com_hls_store_1_0.zip" +set fetch_ip "${ip_path}/vta_fetch/soln/impl/ip/xilinx_com_hls_fetch_1_0.zip" +set load_ip "${ip_path}/vta_load/soln/impl/ip/xilinx_com_hls_load_1_0.zip" +set compute_ip "${ip_path}/vta_compute/soln/impl/ip/xilinx_com_hls_compute_1_0.zip" +set store_ip "${ip_path}/vta_store/soln/impl/ip/xilinx_com_hls_store_1_0.zip" # Create custom project -create_project -force $proj_name $proj_path -part xc7z020clg484-1 +set device [exec python $vta_config --get-fpga-dev] +create_project -force $proj_name $proj_path -part $device # Update IP repository with generated IP file mkdir $ip_lib @@ -125,810 +85,334 @@ update_ip_catalog -add_ip $load_ip -repo_path $ip_lib update_ip_catalog -add_ip $compute_ip -repo_path $ip_lib update_ip_catalog -add_ip $store_ip -repo_path $ip_lib -# CHANGE DESIGN NAME HERE -set design_name $proj_name - -# Creating design if needed -set errMsg "" -set nRet 0 - -set cur_design [current_bd_design -quiet] -set list_cells [get_bd_cells -quiet] - -if { ${design_name} eq "" } { - # USE CASES: - # 1) Design_name not set - - set errMsg "Please set the variable to a non-empty value." - set nRet 1 - -} elseif { ${cur_design} ne "" && ${list_cells} eq "" } { - # USE CASES: - # 2): Current design opened AND is empty AND names same. - # 3): Current design opened AND is empty AND names diff; design_name NOT in project. - # 4): Current design opened AND is empty AND names diff; design_name exists in project. - - if { $cur_design ne $design_name } { - common::send_msg_id "BD_TCL-001" "INFO" "Changing value of from <$design_name> \ - to <$cur_design> since current design is empty." - set design_name [get_property NAME $cur_design] - } - common::send_msg_id "BD_TCL-002" "INFO" "Constructing design in IPI design <$cur_design>..." - -} elseif { ${cur_design} ne "" && $list_cells ne "" && $cur_design eq $design_name } { - # USE CASES: - # 5) Current design opened AND has components AND same names. - - set errMsg "Design <$design_name> already exists in your project, please set the variable \ - to another value." - set nRet 1 -} elseif { [get_files -quiet ${design_name}.bd] ne "" } { - # USE CASES: - # 6) Current opened design, has components, but diff names, design_name exists in project. - # 7) No opened design, design_name exists in project. - - set errMsg "Design <$design_name> already exists in your project, please set the variable \ - to another value." - set nRet 2 - -} else { - # USE CASES: - # 8) No opened design, design_name not in project. - # 9) Current opened design, has components, but diff names, design_name not in project. - - common::send_msg_id "BD_TCL-003" "INFO" "Currently there is no design <$design_name> in \ - project, so creating one..." - - create_bd_design $design_name - - common::send_msg_id "BD_TCL-004" "INFO" "Making design <$design_name> as current_bd_design." 
- current_bd_design $design_name - -} - -common::send_msg_id "BD_TCL-005" "INFO" "Currently the variable is equal \ - to \"$design_name\"." - -if { $nRet != 0 } { - catch {common::send_msg_id "BD_TCL-114" "ERROR" $errMsg} - return $nRet -} ################################################################## -# DESIGN PROCs +# CONFIGURE BLOCK DIAGRAM DESIGN ################################################################## +# Create bd design +create_bd_design $design_name +current_bd_design $design_name - -# Procedure to create entire design; Provide argument to make -# procedure reusable. If parentCell is "", will use root. -proc create_root_design { parentCell clk inp_part wgt_part out_part inp_bus_width inp_mem_depth wgt_bus_width wgt_mem_depth out_bus_width out_mem_depth} { - - variable script_folder - - if { $parentCell eq "" } { - set parentCell [get_bd_cells /] - } - - # Get object for parentCell - set parentObj [get_bd_cells $parentCell] - if { $parentObj == "" } { - catch {common::send_msg_id "BD_TCL-100" "ERROR" "Unable to find parent cell <$parentCell>!"} - return - } - - # Make sure parentObj is hier blk - set parentType [get_property TYPE $parentObj] - if { $parentType ne "hier" } { - catch {common::send_msg_id "BD_TCL-101" "ERROR" "Parent <$parentObj> has TYPE = \ - <$parentType>. Expected to be ."} - return - } - - # Save current instance; Restore later - set oldCurInst [current_bd_instance .] - - # Set parent object as current - current_bd_instance $parentObj - - - # Create interface ports - set DDR [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:ddrx_rtl:1.0 DDR ] - set FIXED_IO [ create_bd_intf_port -mode Master \ - -vlnv xilinx.com:display_processing_system7:fixedio_rtl:1.0 FIXED_IO ] - - # Create ports - - # Create instance: axi_interconnect_1, and set properties - set axi_interconnect_1 \ - [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_1 ] - set_property -dict [ list \ - CONFIG.NUM_MI {5} \ - ] $axi_interconnect_1 - - # Create instance: axi_smc, and set properties - set axi_smc [ create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 axi_smc ] - set_property -dict [ list \ - CONFIG.NUM_SI {5} \ - ] $axi_smc - - # Create instance: axi_timer_1, and set properties - set axi_timer_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_timer:2.0 axi_timer_1 ] - - # Create instance: compute_0, and set properties - set compute_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:compute:1.0 compute_0 ] - set_property -dict [ list \ - CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \ - CONFIG.C_M_AXI_DATA_PORT_DATA_WIDTH {64} \ - CONFIG.C_M_AXI_UOP_PORT_CACHE_VALUE {"1111"} \ - ] $compute_0 - - # Create instance: fetch_0, and set properties - set fetch_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:fetch:1.0 fetch_0 ] - set_property -dict [ list \ - CONFIG.C_M_AXI_INS_PORT_CACHE_VALUE {"1111"} \ - CONFIG.C_M_AXI_INS_PORT_DATA_WIDTH {64} \ - ] $fetch_0 - - # Create instance: g2l_queue, and set properties - set g2l_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 g2l_queue ] - set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {1022} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - 
CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {1023} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {1024} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TUSER_WIDTH {0} \ - ] $g2l_queue - - # Create instance: g2s_queue, and set properties - set g2s_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 g2s_queue ] - set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {1022} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {1023} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {1024} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TUSER_WIDTH {0} \ - ] $g2s_queue - - # Create instance: gemm_queue, and set properties - set gemm_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 gemm_queue ] +# Procedure to initialize FIFO +proc init_fifo_property {fifo width_bytes depth} { set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {510} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {511} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {512} \ + CONFIG.Input_Depth_axis $depth \ CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TDATA_NUM_BYTES {16} \ - CONFIG.TKEEP_WIDTH {16} \ - CONFIG.TSTRB_WIDTH {16} \ - CONFIG.TUSER_WIDTH {0} \ - ] $gemm_queue - - # Create instance: l2g_queue, and set properties - set l2g_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 l2g_queue ] - set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {1022} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {1023} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {1024} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TUSER_WIDTH {0} \ - ] $l2g_queue - - # Create instance: load_0, and set 
properties - set load_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:load:1.0 load_0 ] - set_property -dict [ list \ - CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \ - ] $load_0 + CONFIG.TDATA_NUM_BYTES $width_bytes \ + ] $fifo +} - # Create instance: load_queue, and set properties - set load_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 load_queue ] - set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {510} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {511} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {512} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TDATA_NUM_BYTES {16} \ - CONFIG.TKEEP_WIDTH {16} \ - CONFIG.TSTRB_WIDTH {16} \ - CONFIG.TUSER_WIDTH {0} \ - ] $load_queue - - # Create instance: proc_sys_reset, and set properties - set proc_sys_reset \ - [ create_bd_cell -type ip -vlnv xilinx.com:ip:proc_sys_reset:5.0 proc_sys_reset ] - - # Create instance: processing_system7_1, and set properties - set processing_system7_1 \ - [ create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_1 ] +# Procedure to initialize BRAM +proc init_bram_property {bram width depth} { set_property -dict [ list \ - CONFIG.PCW_CAN0_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_ENET0_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_EN_CLK0_PORT {1} \ - CONFIG.PCW_EN_CLK1_PORT {1} \ - CONFIG.PCW_EN_CLK2_PORT {1} \ - CONFIG.PCW_EN_CLK3_PORT {1} \ - CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ {100} \ - CONFIG.PCW_FPGA1_PERIPHERAL_FREQMHZ {142.86} \ - CONFIG.PCW_FPGA2_PERIPHERAL_FREQMHZ {200} \ - CONFIG.PCW_FPGA3_PERIPHERAL_FREQMHZ {167} \ - CONFIG.PCW_GPIO_MIO_GPIO_ENABLE {0} \ - CONFIG.PCW_I2C0_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_IMPORT_BOARD_PRESET {None} \ - CONFIG.PCW_IRQ_F2P_INTR {1} \ - CONFIG.PCW_QSPI_GRP_SINGLE_SS_ENABLE {0} \ - CONFIG.PCW_QSPI_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_SD0_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_USB0_PERIPHERAL_ENABLE {0} \ - CONFIG.PCW_USE_DEFAULT_ACP_USER_VAL {1} \ - CONFIG.PCW_USE_FABRIC_INTERRUPT {1} \ - CONFIG.PCW_USE_HIGH_OCM {1} \ - CONFIG.PCW_USE_S_AXI_ACP {1} \ - CONFIG.PCW_USE_S_AXI_HP0 {0} \ - CONFIG.PCW_USE_S_AXI_HP1 {0} \ - CONFIG.PCW_USE_S_AXI_HP2 {0} \ - CONFIG.PCW_USE_S_AXI_HP3 {0} \ - CONFIG.preset {ZC702} \ - ] $processing_system7_1 + CONFIG.Assume_Synchronous_Clk {true} \ + CONFIG.Byte_Size {8} \ + CONFIG.Enable_32bit_Address {true} \ + CONFIG.Enable_B {Use_ENB_Pin} \ + CONFIG.Memory_Type {True_Dual_Port_RAM} \ + CONFIG.Read_Width_A $width \ + CONFIG.Read_Width_B $width \ + CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ + CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ + CONFIG.Use_Byte_Write_Enable {true} \ + CONFIG.Use_RSTA_Pin {true} \ + CONFIG.Use_RSTB_Pin {true} \ + CONFIG.Write_Depth_A $depth \ + CONFIG.Write_Width_A $width \ + CONFIG.Write_Width_B $width \ + ] $bram +} - # Create instance: s2g_queue, and set properties - set s2g_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 s2g_queue ] - set_property 
-dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {1022} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {1023} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {1024} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TUSER_WIDTH {0} \ - ] $s2g_queue +# Create instance: proc_sys_reset, and set properties +set proc_sys_reset \ + [ create_bd_cell -type ip -vlnv xilinx.com:ip:proc_sys_reset:5.0 proc_sys_reset ] + +# Create instance: pll_clk, and set properties +set pll_clk [ create_bd_cell -type ip -vlnv xilinx.com:ip:clk_wiz:6.0 pll_clk ] +set_property -dict [ list \ + CONFIG.CLKOUT1_REQUESTED_OUT_FREQ $clock_freq \ + CONFIG.RESET_PORT {resetn} \ + CONFIG.RESET_TYPE {ACTIVE_LOW} \ + CONFIG.USE_LOCKED {false} \ +] $pll_clk + +# Create instance: axi_smc0, and set properties +set axi_smc0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 axi_smc0 ] +set_property -dict [ list \ + CONFIG.NUM_MI {1} \ + CONFIG.NUM_SI {5} \ +] $axi_smc0 + +# Create instance: axi_xbar, and set properties +set axi_xbar \ + [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_xbar ] +set_property -dict [ list \ + CONFIG.NUM_MI {4} \ + CONFIG.NUM_SI {1} \ +] $axi_xbar + +# Create instance: fetch_0, and set properties +set fetch_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:fetch:1.0 fetch_0 ] +set_property -dict [ list \ + CONFIG.C_M_AXI_INS_PORT_CACHE_VALUE $axi_cache \ + CONFIG.C_M_AXI_INS_PORT_PROT_VALUE $axi_prot \ +] $fetch_0 + +# Create instance: load_0, and set properties +set load_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:load:1.0 load_0 ] +set_property -dict [ list \ + CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \ + CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \ +] $load_0 + +# Create instance: compute_0, and set properties +set compute_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:compute:1.0 compute_0 ] +set_property -dict [ list \ + CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \ + CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \ + CONFIG.C_M_AXI_UOP_PORT_CACHE_VALUE $axi_cache \ + CONFIG.C_M_AXI_UOP_PORT_PROT_VALUE $axi_prot \ +] $compute_0 + +# Create instance: store_0, and set properties +set store_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:store:1.0 store_0 ] +set_property -dict [ list \ + CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE $axi_cache \ + CONFIG.C_M_AXI_DATA_PORT_PROT_VALUE $axi_prot \ +] $store_0 + +# Create command queues and set properties +set cmd_queue_list {load_queue gemm_queue store_queue} +foreach cmd_queue $cmd_queue_list { + set tmp_cmd_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 $cmd_queue ] + # Width is 16B (128b, as set in hw_spec.h), depth is 512 (depth of FIFO on Zynq 7000 and Zynq Ultrascale+) + # TODO: derive it from vta_config.h + [ init_fifo_property $tmp_cmd_queue 16 512 ] +} - # Create instance: store_0, and set properties - set store_0 [ create_bd_cell -type ip -vlnv xilinx.com:hls:store:1.0 store_0 ] - set_property -dict [ list \ 
-CONFIG.C_M_AXI_DATA_PORT_CACHE_VALUE {"1111"} \ - ] $store_0 +# Create dependence queues and set properties +set dep_queue_list {l2g_queue g2l_queue g2s_queue s2g_queue} +foreach dep_queue $dep_queue_list { + set tmp_dep_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 $dep_queue ] + # Width is 1B (min width), depth is 1024 + # TODO: derive it from vta_config.h + [ init_fifo_property $tmp_dep_queue 1 1024 ] +} - # Create instance: store_queue, and set properties - set store_queue [ create_bd_cell -type ip -vlnv xilinx.com:ip:fifo_generator:13.2 store_queue ] - set_property -dict [ list \ - CONFIG.Empty_Threshold_Assert_Value_axis {510} \ - CONFIG.Empty_Threshold_Assert_Value_rach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wach {14} \ - CONFIG.Empty_Threshold_Assert_Value_wrch {14} \ - CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} \ - CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} \ - CONFIG.Full_Flags_Reset_Value {1} \ - CONFIG.Full_Threshold_Assert_Value_axis {511} \ - CONFIG.Full_Threshold_Assert_Value_rach {15} \ - CONFIG.Full_Threshold_Assert_Value_wach {15} \ - CONFIG.Full_Threshold_Assert_Value_wrch {15} \ - CONFIG.INTERFACE_TYPE {AXI_STREAM} \ - CONFIG.Input_Depth_axis {512} \ - CONFIG.Reset_Type {Asynchronous_Reset} \ - CONFIG.TDATA_NUM_BYTES {16} \ - CONFIG.TKEEP_WIDTH {16} \ - CONFIG.TSTRB_WIDTH {16} \ - CONFIG.TUSER_WIDTH {0} \ - ] $store_queue - - # Create instance: xlconcat_1, and set properties - set xlconcat_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_1 ] - set_property -dict [ list \ -CONFIG.NUM_PORTS {5} \ - ] $xlconcat_1 - - # Create and connect inp_mem partitions - if {${inp_part} > 1} { - for {set i 0} {$i < ${inp_part}} {incr i} { - # Create instance: inp_mem, and set properties - set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 inp_mem_${i} ] - set_property -dict [ list \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A $inp_bus_width \ - CONFIG.Read_Width_B $inp_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $inp_mem_depth \ - CONFIG.Write_Width_A $inp_bus_width \ - CONFIG.Write_Width_B $inp_bus_width \ - CONFIG.use_bram_block {BRAM_Controller} \ - ] $inp_mem - # Create interface connections - connect_bd_intf_net -intf_net load_0_inp_mem_${i}_V_PORTA \ - [get_bd_intf_pins $inp_mem/BRAM_PORTA] \ - [get_bd_intf_pins load_0/inp_mem_${i}_V_PORTA] - connect_bd_intf_net -intf_net compute_0_inp_mem_${i}_V_PORTA \ - [get_bd_intf_pins compute_0/inp_mem_${i}_V_PORTA] \ - [get_bd_intf_pins $inp_mem/BRAM_PORTB] - } +# Create and connect inp_mem partitions +for {set i 0} {$i < $inp_part} {incr i} { + # Create instance: inp_mem, and set properties + set inp_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 inp_mem_${i} ] + [ init_bram_property $inp_mem $inp_mem_width $inp_mem_depth ] + # If module has more than 1 mem port, the naming convention changes + if {$inp_part > 1} { + set porta [get_bd_intf_pins load_0/inp_mem_${i}_V_PORTA] + set portb [get_bd_intf_pins compute_0/inp_mem_${i}_V_PORTA] } else { - # Create instance: inp_mem, and set properties - set inp_mem [ 
create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 inp_mem ] - set_property -dict [ list \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A $inp_bus_width \ - CONFIG.Read_Width_B $inp_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $inp_mem_depth \ - CONFIG.Write_Width_A $inp_bus_width \ - CONFIG.Write_Width_B $inp_bus_width \ - CONFIG.use_bram_block {BRAM_Controller} \ - ] $inp_mem - # Create interface connections - connect_bd_intf_net -intf_net load_0_inp_mem_V_PORTA \ - [get_bd_intf_pins $inp_mem/BRAM_PORTA] \ - [get_bd_intf_pins load_0/inp_mem_V_PORTA] - connect_bd_intf_net -intf_net compute_0_inp_mem_V_PORTA \ - [get_bd_intf_pins compute_0/inp_mem_V_PORTA] \ - [get_bd_intf_pins $inp_mem/BRAM_PORTB] + set porta [get_bd_intf_pins load_0/inp_mem_V_PORTA] + set portb [get_bd_intf_pins compute_0/inp_mem_V_PORTA] } + # Create interface connections + connect_bd_intf_net -intf_net load_0_inp_mem_V_PORTA \ + [get_bd_intf_pins $inp_mem/BRAM_PORTA] \ + $porta + connect_bd_intf_net -intf_net compute_0_inp_mem_V_PORTA \ + [get_bd_intf_pins $inp_mem/BRAM_PORTB] \ + $portb +} - # Create and connect wgt_mem partitions - if {${wgt_part} > 1} { - for {set i 0} {$i < ${wgt_part}} {incr i} { - # Create instance: wgt_mem, and set properties - set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 wgt_mem_${i} ] - set_property -dict [ list \ - CONFIG.Assume_Synchronous_Clk {true} \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A $wgt_bus_width \ - CONFIG.Read_Width_B $wgt_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $wgt_mem_depth \ - CONFIG.Write_Width_A $wgt_bus_width \ - CONFIG.Write_Width_B $wgt_bus_width \ - ] $wgt_mem - # Create interface connections - connect_bd_intf_net -intf_net load_0_wgt_mem_${i}_V_PORTA \ - [get_bd_intf_pins load_0/wgt_mem_${i}_V_PORTA] \ - [get_bd_intf_pins $wgt_mem/BRAM_PORTA] - connect_bd_intf_net -intf_net compute_0_wgt_mem_${i}_V_PORTA \ - [get_bd_intf_pins compute_0/wgt_mem_${i}_V_PORTA] \ - [get_bd_intf_pins $wgt_mem/BRAM_PORTB] - } +# Create and connect wgt_mem partitions +for {set i 0} {$i < $wgt_part} {incr i} { + # Create instance: wgt_mem, and set properties + set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 wgt_mem_${i} ] + [ init_bram_property $wgt_mem $wgt_mem_width $wgt_mem_depth ] + # If module has more than 1 mem port, the naming convention changes + if {$wgt_part > 1} { + set porta [get_bd_intf_pins load_0/wgt_mem_${i}_V_PORTA] + set portb [get_bd_intf_pins compute_0/wgt_mem_${i}_V_PORTA] } else { - # Create instance: wgt_mem, and set properties - set wgt_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 wgt_mem ] - set_property -dict [ list \ - CONFIG.Assume_Synchronous_Clk {true} \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A 
$wgt_bus_width \ - CONFIG.Read_Width_B $wgt_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $wgt_mem_depth \ - CONFIG.Write_Width_A $wgt_bus_width \ - CONFIG.Write_Width_B $wgt_bus_width \ - ] $wgt_mem - # Create interface connections - connect_bd_intf_net -intf_net load_0_wgt_mem_V_PORTA \ - [get_bd_intf_pins load_0/wgt_mem_V_PORTA] \ - [get_bd_intf_pins $wgt_mem/BRAM_PORTA] - connect_bd_intf_net -intf_net compute_0_wgt_mem_V_PORTA \ - [get_bd_intf_pins compute_0/wgt_mem_V_PORTA] \ - [get_bd_intf_pins $wgt_mem/BRAM_PORTB] + set porta [get_bd_intf_pins load_0/wgt_mem_V_PORTA] + set portb [get_bd_intf_pins compute_0/wgt_mem_V_PORTA] } + # Create interface connections + connect_bd_intf_net -intf_net load_0_wgt_mem_${i}_V_PORTA \ + [get_bd_intf_pins $wgt_mem/BRAM_PORTA] \ + $porta + connect_bd_intf_net -intf_net compute_0_wgt_mem_${i}_V_PORTA \ + [get_bd_intf_pins $wgt_mem/BRAM_PORTB] \ + $portb +} - # Create and connect out_mem partitions - if {${out_part} > 1} { - for {set i 0} {$i < ${out_part}} {incr i} { - # Create instance: out_mem, and set properties - set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 out_mem_${i} ] - set_property -dict [ list \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A $out_bus_width \ - CONFIG.Read_Width_B $out_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $out_mem_depth \ - CONFIG.Write_Width_A $out_bus_width \ - CONFIG.Write_Width_B $out_bus_width \ - CONFIG.use_bram_block {BRAM_Controller} \ - ] $out_mem - # Create interface connections - connect_bd_intf_net -intf_net compute_0_out_mem_${i}_V_PORTA \ - [get_bd_intf_pins compute_0/out_mem_${i}_V_PORTA] \ - [get_bd_intf_pins $out_mem/BRAM_PORTA] - connect_bd_intf_net -intf_net store_0_out_mem_${i}_V_PORTA \ - [get_bd_intf_pins $out_mem/BRAM_PORTB] \ - [get_bd_intf_pins store_0/out_mem_${i}_V_PORTA] - } +# Create and connect out_mem partitions +for {set i 0} {$i < $out_part} {incr i} { + # Create instance: out_mem, and set properties + set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 out_mem_${i} ] + [ init_bram_property $out_mem $out_mem_width $out_mem_depth ] + # If module has more than 1 mem port, the naming convention changes + if {$out_part > 1} { + set porta [get_bd_intf_pins compute_0/out_mem_${i}_V_PORTA] + set portb [get_bd_intf_pins store_0/out_mem_${i}_V_PORTA] } else { - # Create instance: out_mem, and set properties - set out_mem [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 out_mem ] - set_property -dict [ list \ - CONFIG.Byte_Size {8} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Enable_B {Use_ENB_Pin} \ - CONFIG.Memory_Type {True_Dual_Port_RAM} \ - CONFIG.Read_Width_A $out_bus_width \ - CONFIG.Read_Width_B $out_bus_width \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {true} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Depth_A $out_mem_depth \ - 
CONFIG.Write_Width_A $out_bus_width \ - CONFIG.Write_Width_B $out_bus_width \ - CONFIG.use_bram_block {BRAM_Controller} \ - ] $out_mem - # Create interface connections - connect_bd_intf_net -intf_net compute_0_out_mem_V_PORTA \ - [get_bd_intf_pins compute_0/out_mem_V_PORTA] \ - [get_bd_intf_pins $out_mem/BRAM_PORTA] - connect_bd_intf_net -intf_net store_0_out_mem_V_PORTA \ - [get_bd_intf_pins $out_mem/BRAM_PORTB] \ - [get_bd_intf_pins store_0/out_mem_V_PORTA] + set porta [get_bd_intf_pins compute_0/out_mem_V_PORTA] + set portb [get_bd_intf_pins store_0/out_mem_V_PORTA] } - # Create interface connections - connect_bd_intf_net -intf_net axi_interconnect_1_M01_AXI \ - [get_bd_intf_pins axi_interconnect_1/M01_AXI] \ - [get_bd_intf_pins fetch_0/s_axi_CONTROL_BUS] - connect_bd_intf_net -intf_net axi_interconnect_1_M02_AXI \ - [get_bd_intf_pins axi_interconnect_1/M02_AXI] \ - [get_bd_intf_pins load_0/s_axi_CONTROL_BUS] - connect_bd_intf_net -intf_net axi_interconnect_1_M03_AXI \ - [get_bd_intf_pins axi_interconnect_1/M03_AXI] \ - [get_bd_intf_pins compute_0/s_axi_CONTROL_BUS] - connect_bd_intf_net -intf_net axi_interconnect_1_M04_AXI \ - [get_bd_intf_pins axi_interconnect_1/M04_AXI] \ - [get_bd_intf_pins store_0/s_axi_CONTROL_BUS] - connect_bd_intf_net -intf_net axi_smc_M00_AXI \ - [get_bd_intf_pins axi_smc/M00_AXI] \ - [get_bd_intf_pins processing_system7_1/S_AXI_ACP] - connect_bd_intf_net -intf_net compute_0_g2l_dep_queue_V \ - [get_bd_intf_pins compute_0/g2l_dep_queue_V] \ - [get_bd_intf_pins g2l_queue/S_AXIS] - connect_bd_intf_net -intf_net compute_0_g2s_dep_queue_V \ - [get_bd_intf_pins compute_0/g2s_dep_queue_V] \ - [get_bd_intf_pins g2s_queue/S_AXIS] - connect_bd_intf_net -intf_net compute_0_m_axi_data_port \ - [get_bd_intf_pins axi_smc/S02_AXI] \ - [get_bd_intf_pins compute_0/m_axi_data_port] - connect_bd_intf_net -intf_net compute_0_m_axi_uop_port \ - [get_bd_intf_pins axi_smc/S01_AXI] \ - [get_bd_intf_pins compute_0/m_axi_uop_port] - connect_bd_intf_net -intf_net fetch_0_gemm_queue_V_V \ - [get_bd_intf_pins fetch_0/gemm_queue_V_V] \ - [get_bd_intf_pins gemm_queue/S_AXIS] - connect_bd_intf_net -intf_net fetch_0_l2g_dep_queue_V \ - [get_bd_intf_pins l2g_queue/S_AXIS] \ - [get_bd_intf_pins load_0/l2g_dep_queue_V] - connect_bd_intf_net -intf_net fetch_0_load_queue_V_V \ - [get_bd_intf_pins fetch_0/load_queue_V_V] \ - [get_bd_intf_pins load_queue/S_AXIS] - connect_bd_intf_net -intf_net fetch_0_m_axi_ins_port \ - [get_bd_intf_pins axi_smc/S00_AXI] \ - [get_bd_intf_pins fetch_0/m_axi_ins_port] - connect_bd_intf_net -intf_net fetch_0_store_queue_V_V \ - [get_bd_intf_pins fetch_0/store_queue_V_V] \ - [get_bd_intf_pins store_queue/S_AXIS] - connect_bd_intf_net -intf_net g2l_queue_M_AXIS \ - [get_bd_intf_pins g2l_queue/M_AXIS] \ - [get_bd_intf_pins load_0/g2l_dep_queue_V] - connect_bd_intf_net -intf_net g2s_queue_M_AXIS \ - [get_bd_intf_pins g2s_queue/M_AXIS] \ - [get_bd_intf_pins store_0/g2s_dep_queue_V] - connect_bd_intf_net -intf_net gemm_queue_M_AXIS \ - [get_bd_intf_pins compute_0/gemm_queue_V_V] \ - [get_bd_intf_pins gemm_queue/M_AXIS] - connect_bd_intf_net -intf_net l2g_queue_M_AXIS \ - [get_bd_intf_pins compute_0/l2g_dep_queue_V] \ - [get_bd_intf_pins l2g_queue/M_AXIS] - connect_bd_intf_net -intf_net load_0_m_axi_data_port \ - [get_bd_intf_pins axi_smc/S03_AXI] \ - [get_bd_intf_pins load_0/m_axi_data_port] - connect_bd_intf_net -intf_net load_queue_M_AXIS \ - [get_bd_intf_pins load_0/load_queue_V_V] \ - [get_bd_intf_pins load_queue/M_AXIS] - connect_bd_intf_net -intf_net 
processing_system7_1_axi_periph_m00_axi \ - [get_bd_intf_pins axi_interconnect_1/M00_AXI] \ - [get_bd_intf_pins axi_timer_1/S_AXI] - connect_bd_intf_net -intf_net processing_system7_1_ddr \ - [get_bd_intf_ports DDR] \ - [get_bd_intf_pins processing_system7_1/DDR] - connect_bd_intf_net -intf_net processing_system7_1_fixed_io \ - [get_bd_intf_ports FIXED_IO] \ - [get_bd_intf_pins processing_system7_1/FIXED_IO] - connect_bd_intf_net -intf_net processing_system7_1_m_axi_gp0 \ - [get_bd_intf_pins axi_interconnect_1/S00_AXI] \ - [get_bd_intf_pins processing_system7_1/M_AXI_GP0] - connect_bd_intf_net -intf_net s2g_queue_M_AXIS \ - [get_bd_intf_pins compute_0/s2g_dep_queue_V] \ - [get_bd_intf_pins s2g_queue/M_AXIS] - connect_bd_intf_net -intf_net store_0_m_axi_data_port \ - [get_bd_intf_pins axi_smc/S04_AXI] \ - [get_bd_intf_pins store_0/m_axi_data_port] - connect_bd_intf_net -intf_net store_0_s2g_dep_queue_V \ - [get_bd_intf_pins s2g_queue/S_AXIS] \ - [get_bd_intf_pins store_0/s2g_dep_queue_V] - connect_bd_intf_net -intf_net store_queue_M_AXIS \ - [get_bd_intf_pins store_0/store_queue_V_V] \ - [get_bd_intf_pins store_queue/M_AXIS] - - # Create port connections - connect_bd_net -net axi_timer_1_interrupt \ - [get_bd_pins axi_timer_1/interrupt] \ - [get_bd_pins xlconcat_1/In0] - connect_bd_net -net compute_0_interrupt \ - [get_bd_pins compute_0/interrupt] \ - [get_bd_pins xlconcat_1/In3] - connect_bd_net -net fetch_0_interrupt \ - [get_bd_pins fetch_0/interrupt] \ - [get_bd_pins xlconcat_1/In1] - connect_bd_net -net load_0_interrupt \ - [get_bd_pins load_0/interrupt] \ - [get_bd_pins xlconcat_1/In2] - connect_bd_net -net proc_sys_reset_interconnect_aresetn \ - [get_bd_pins axi_interconnect_1/ARESETN] \ - [get_bd_pins proc_sys_reset/interconnect_aresetn] - connect_bd_net -net proc_sys_reset_peripheral_aresetn \ - [get_bd_pins axi_interconnect_1/M00_ARESETN] \ - [get_bd_pins axi_interconnect_1/M01_ARESETN] \ - [get_bd_pins axi_interconnect_1/M02_ARESETN] \ - [get_bd_pins axi_interconnect_1/M03_ARESETN] \ - [get_bd_pins axi_interconnect_1/M04_ARESETN] \ - [get_bd_pins axi_interconnect_1/S00_ARESETN] \ - [get_bd_pins axi_smc/aresetn] \ - [get_bd_pins axi_timer_1/s_axi_aresetn] \ - [get_bd_pins compute_0/ap_rst_n] \ - [get_bd_pins fetch_0/ap_rst_n] \ - [get_bd_pins g2l_queue/s_aresetn] \ - [get_bd_pins g2s_queue/s_aresetn] \ - [get_bd_pins gemm_queue/s_aresetn] \ - [get_bd_pins l2g_queue/s_aresetn] \ - [get_bd_pins load_0/ap_rst_n] \ - [get_bd_pins load_queue/s_aresetn] \ - [get_bd_pins proc_sys_reset/peripheral_aresetn] \ - [get_bd_pins s2g_queue/s_aresetn] \ - [get_bd_pins store_0/ap_rst_n] \ - [get_bd_pins store_queue/s_aresetn] - connect_bd_net -net processing_system7_1_FCLK_CLK \ - [get_bd_pins axi_interconnect_1/ACLK] \ - [get_bd_pins axi_interconnect_1/M00_ACLK] \ - [get_bd_pins axi_interconnect_1/M01_ACLK] \ - [get_bd_pins axi_interconnect_1/M02_ACLK] \ - [get_bd_pins axi_interconnect_1/M03_ACLK] \ - [get_bd_pins axi_interconnect_1/M04_ACLK] \ - [get_bd_pins axi_interconnect_1/S00_ACLK] \ - [get_bd_pins axi_smc/aclk] \ - [get_bd_pins axi_timer_1/s_axi_aclk] \ - [get_bd_pins compute_0/ap_clk] \ - [get_bd_pins fetch_0/ap_clk] \ - [get_bd_pins g2l_queue/s_aclk] \ - [get_bd_pins g2s_queue/s_aclk] \ - [get_bd_pins gemm_queue/s_aclk] \ - [get_bd_pins l2g_queue/s_aclk] \ - [get_bd_pins load_0/ap_clk] \ - [get_bd_pins load_queue/s_aclk] \ - [get_bd_pins proc_sys_reset/slowest_sync_clk] \ - [get_bd_pins processing_system7_1/FCLK_CLK${clk}] \ - [get_bd_pins processing_system7_1/M_AXI_GP0_ACLK] \ - 
[get_bd_pins processing_system7_1/S_AXI_ACP_ACLK] \ - [get_bd_pins s2g_queue/s_aclk] \ - [get_bd_pins store_0/ap_clk] \ - [get_bd_pins store_queue/s_aclk] - connect_bd_net -net processing_system7_1_fclk_reset0_n \ - [get_bd_pins proc_sys_reset/ext_reset_in] \ - [get_bd_pins processing_system7_1/FCLK_RESET0_N] - connect_bd_net -net store_0_interrupt \ - [get_bd_pins store_0/interrupt] \ - [get_bd_pins xlconcat_1/In4] - connect_bd_net -net xlconcat_1_dout \ - [get_bd_pins processing_system7_1/IRQ_F2P] \ - [get_bd_pins xlconcat_1/dout] - - # Create address segments - create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \ - SEG_processing_system7_1_ACP_DDR_LOWOCM - create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \ - SEG_processing_system7_1_ACP_DDR_LOWOCM - create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \ - SEG_processing_system7_1_ACP_HIGH_OCM - create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \ - SEG_processing_system7_1_ACP_HIGH_OCM - create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \ - SEG_processing_system7_1_ACP_IOP - create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \ - SEG_processing_system7_1_ACP_IOP - create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \ - SEG_processing_system7_1_ACP_M_AXI_GP0 - create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \ - [get_bd_addr_spaces compute_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \ - SEG_processing_system7_1_ACP_M_AXI_GP0 - create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \ - [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \ - SEG_processing_system7_1_ACP_DDR_LOWOCM - create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \ - [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \ - SEG_processing_system7_1_ACP_HIGH_OCM - create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \ - [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \ - SEG_processing_system7_1_ACP_IOP - create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \ - [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \ - SEG_processing_system7_1_ACP_M_AXI_GP0 - create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \ - [get_bd_addr_spaces load_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \ - SEG_processing_system7_1_ACP_DDR_LOWOCM - create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \ - [get_bd_addr_spaces load_0/Data_m_axi_data_port] \ 
- [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \ - SEG_processing_system7_1_ACP_HIGH_OCM - create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \ - [get_bd_addr_spaces load_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \ - SEG_processing_system7_1_ACP_IOP - create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \ - [get_bd_addr_spaces load_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \ - SEG_processing_system7_1_ACP_M_AXI_GP0 - create_bd_addr_seg -range 0x00010000 -offset 0x42800000 \ - [get_bd_addr_spaces processing_system7_1/Data] \ - [get_bd_addr_segs axi_timer_1/S_AXI/Reg] SEG_axi_timer_1_Reg - create_bd_addr_seg -range 0x00010000 -offset 0x43C10000 \ - [get_bd_addr_spaces processing_system7_1/Data] \ - [get_bd_addr_segs compute_0/s_axi_CONTROL_BUS/Reg] SEG_compute_0_Reg - create_bd_addr_seg -range 0x00010000 -offset 0x43C00000 \ - [get_bd_addr_spaces processing_system7_1/Data] \ - [get_bd_addr_segs fetch_0/s_axi_CONTROL_BUS/Reg] SEG_fetch_0_Reg - create_bd_addr_seg -range 0x00010000 -offset 0x43C20000 \ - [get_bd_addr_spaces processing_system7_1/Data] \ - [get_bd_addr_segs load_0/s_axi_CONTROL_BUS/Reg] SEG_load_0_Reg - create_bd_addr_seg -range 0x00010000 -offset 0x43C30000 \ - [get_bd_addr_spaces processing_system7_1/Data] \ - [get_bd_addr_segs store_0/s_axi_CONTROL_BUS/Reg] SEG_store_0_Reg - create_bd_addr_seg -range 0x40000000 -offset 0x00000000 \ - [get_bd_addr_spaces store_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_DDR_LOWOCM] \ - SEG_processing_system7_1_ACP_DDR_LOWOCM - create_bd_addr_seg -range 0x00040000 -offset 0xFFFC0000 \ - [get_bd_addr_spaces store_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_HIGH_OCM] \ - SEG_processing_system7_1_ACP_HIGH_OCM - create_bd_addr_seg -range 0x00400000 -offset 0xE0000000 \ - [get_bd_addr_spaces store_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_IOP] \ - SEG_processing_system7_1_ACP_IOP - create_bd_addr_seg -range 0x40000000 -offset 0x40000000 \ - [get_bd_addr_spaces store_0/Data_m_axi_data_port] \ - [get_bd_addr_segs processing_system7_1/S_AXI_ACP/ACP_M_AXI_GP0] \ - SEG_processing_system7_1_ACP_M_AXI_GP0 - - - # Restore current instance - current_bd_instance $oldCurInst - - save_bd_design + connect_bd_intf_net -intf_net compute_0_out_mem_${i}_V_PORTA \ + [get_bd_intf_pins $out_mem/BRAM_PORTA] \ + $porta + connect_bd_intf_net -intf_net store_0_out_mem_${i}_V_PORTA \ + [get_bd_intf_pins $out_mem/BRAM_PORTB] \ + $portb +} + +# Create instance: processing_system, and set properties +if { $device_family eq "zynq-7000" } { + set processing_system [ create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system ] + set_property -dict [ list \ + CONFIG.PCW_EN_CLK0_PORT {1} \ + CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ {100} \ + CONFIG.PCW_USE_DEFAULT_ACP_USER_VAL {1} \ + CONFIG.PCW_USE_S_AXI_ACP {1} \ + CONFIG.preset {ZC702} \ + ] $processing_system + # Get ports that are specific to the Zynq 7000 processing system + set ps_clk [get_bd_pins processing_system/FCLK_CLK0] + set ps_rstn [get_bd_pins processing_system/FCLK_RESET0_N] + set maxi_clk [get_bd_pins processing_system/M_AXI_GP0_ACLK] + set saxi_clk [get_bd_pins processing_system/S_AXI_ACP_ACLK] + set maxi [get_bd_intf_pins processing_system/M_AXI_GP0] + set saxi [get_bd_intf_pins processing_system/S_AXI_ACP] +} elseif { $device_family eq "zynq-ultrascale+" } { + 
set processing_system [ create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.2 processing_system ] + set_property -dict [ list \ + CONFIG.PSU__FPGA_PL0_ENABLE {1} \ + CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ {100} \ + CONFIG.PSU__USE__M_AXI_GP0 {1} \ + CONFIG.PSU__USE__M_AXI_GP2 {0} \ + CONFIG.PSU__USE__S_AXI_GP0 {1} + ] $processing_system + # Get ports that are specific to the Zynq Ultrascale MPSoC processing system + set ps_clk [get_bd_pins processing_system/pl_clk0] + set ps_rstn [get_bd_pins processing_system/pl_resetn0] + set maxi_clk [get_bd_pins processing_system/maxihpm0_fpd_aclk] + set saxi_clk [get_bd_pins processing_system/saxihpc0_fpd_aclk] + set maxi [get_bd_intf_pins processing_system/M_AXI_HPM0_FPD] + set saxi [get_bd_intf_pins processing_system/S_AXI_HPC0_FPD] } -# End of create_root_design() + +# Create interface connections +connect_bd_intf_net -intf_net axi_xbar_M00_AXI [get_bd_intf_pins axi_xbar/M00_AXI] [get_bd_intf_pins fetch_0/s_axi_CONTROL_BUS] +connect_bd_intf_net -intf_net axi_xbar_M01_AXI [get_bd_intf_pins axi_xbar/M01_AXI] [get_bd_intf_pins load_0/s_axi_CONTROL_BUS] +connect_bd_intf_net -intf_net axi_xbar_M02_AXI [get_bd_intf_pins axi_xbar/M02_AXI] [get_bd_intf_pins compute_0/s_axi_CONTROL_BUS] +connect_bd_intf_net -intf_net axi_xbar_M03_AXI [get_bd_intf_pins axi_xbar/M03_AXI] [get_bd_intf_pins store_0/s_axi_CONTROL_BUS] +connect_bd_intf_net -intf_net fetch_0_l2g_dep_queue_V [get_bd_intf_pins l2g_queue/S_AXIS] [get_bd_intf_pins load_0/l2g_dep_queue_V] +connect_bd_intf_net -intf_net fetch_0_load_queue_V_V [get_bd_intf_pins fetch_0/load_queue_V_V] [get_bd_intf_pins load_queue/S_AXIS] +connect_bd_intf_net -intf_net fetch_0_gemm_queue_V_V [get_bd_intf_pins fetch_0/gemm_queue_V_V] [get_bd_intf_pins gemm_queue/S_AXIS] +connect_bd_intf_net -intf_net fetch_0_store_queue_V_V [get_bd_intf_pins fetch_0/store_queue_V_V] [get_bd_intf_pins store_queue/S_AXIS] +connect_bd_intf_net -intf_net compute_0_g2l_dep_queue_V [get_bd_intf_pins compute_0/g2l_dep_queue_V] [get_bd_intf_pins g2l_queue/S_AXIS] +connect_bd_intf_net -intf_net compute_0_g2s_dep_queue_V [get_bd_intf_pins compute_0/g2s_dep_queue_V] [get_bd_intf_pins g2s_queue/S_AXIS] +connect_bd_intf_net -intf_net store_0_s2g_dep_queue_V [get_bd_intf_pins s2g_queue/S_AXIS] [get_bd_intf_pins store_0/s2g_dep_queue_V] +connect_bd_intf_net -intf_net load_queue_M_AXIS [get_bd_intf_pins load_0/load_queue_V_V] [get_bd_intf_pins load_queue/M_AXIS] +connect_bd_intf_net -intf_net gemm_queue_M_AXIS [get_bd_intf_pins compute_0/gemm_queue_V_V] [get_bd_intf_pins gemm_queue/M_AXIS] +connect_bd_intf_net -intf_net store_queue_M_AXIS [get_bd_intf_pins store_0/store_queue_V_V] [get_bd_intf_pins store_queue/M_AXIS] +connect_bd_intf_net -intf_net l2g_queue_M_AXIS [get_bd_intf_pins compute_0/l2g_dep_queue_V] [get_bd_intf_pins l2g_queue/M_AXIS] +connect_bd_intf_net -intf_net g2l_queue_M_AXIS [get_bd_intf_pins g2l_queue/M_AXIS] [get_bd_intf_pins load_0/g2l_dep_queue_V] +connect_bd_intf_net -intf_net g2s_queue_M_AXIS [get_bd_intf_pins g2s_queue/M_AXIS] [get_bd_intf_pins store_0/g2s_dep_queue_V] +connect_bd_intf_net -intf_net s2g_queue_M_AXIS [get_bd_intf_pins compute_0/s2g_dep_queue_V] [get_bd_intf_pins s2g_queue/M_AXIS] +connect_bd_intf_net -intf_net fetch_0_m_axi_ins_port [get_bd_intf_pins axi_smc0/S00_AXI] [get_bd_intf_pins fetch_0/m_axi_ins_port] +connect_bd_intf_net -intf_net load_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S01_AXI] [get_bd_intf_pins load_0/m_axi_data_port] +connect_bd_intf_net -intf_net compute_0_m_axi_uop_port 
[get_bd_intf_pins axi_smc0/S02_AXI] [get_bd_intf_pins compute_0/m_axi_uop_port] +connect_bd_intf_net -intf_net compute_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S03_AXI] [get_bd_intf_pins compute_0/m_axi_data_port] +connect_bd_intf_net -intf_net store_0_m_axi_data_port [get_bd_intf_pins axi_smc0/S04_AXI] [get_bd_intf_pins store_0/m_axi_data_port] +connect_bd_intf_net -intf_net axi_smc0_M00_AXI [get_bd_intf_pins axi_smc0/M00_AXI] $saxi +connect_bd_intf_net -intf_net processing_system_m_axi [get_bd_intf_pins axi_xbar/S00_AXI] $maxi + +# Create port connections +connect_bd_net -net processing_system_reset \ + [get_bd_pins pll_clk/resetn] \ + [get_bd_pins proc_sys_reset/ext_reset_in] \ + $ps_rstn +connect_bd_net -net ps_clk_net \ + [get_bd_pins pll_clk/clk_in1] \ + $ps_clk +connect_bd_net -net proc_sys_reset_interconnect_aresetn \ + [get_bd_pins axi_xbar/ARESETN] \ + [get_bd_pins proc_sys_reset/interconnect_aresetn] +connect_bd_net -net proc_sys_reset_peripheral_aresetn \ + [get_bd_pins proc_sys_reset/peripheral_aresetn] \ + [get_bd_pins axi_smc0/aresetn] \ + [get_bd_pins axi_xbar/M00_ARESETN] \ + [get_bd_pins axi_xbar/M01_ARESETN] \ + [get_bd_pins axi_xbar/M02_ARESETN] \ + [get_bd_pins axi_xbar/M03_ARESETN] \ + [get_bd_pins axi_xbar/S00_ARESETN] \ + [get_bd_pins fetch_0/ap_rst_n] \ + [get_bd_pins load_0/ap_rst_n] \ + [get_bd_pins store_0/ap_rst_n] \ + [get_bd_pins compute_0/ap_rst_n] \ + [get_bd_pins load_queue/s_aresetn] \ + [get_bd_pins gemm_queue/s_aresetn] \ + [get_bd_pins store_queue/s_aresetn] \ + [get_bd_pins l2g_queue/s_aresetn] \ + [get_bd_pins g2l_queue/s_aresetn] \ + [get_bd_pins g2s_queue/s_aresetn] \ + [get_bd_pins s2g_queue/s_aresetn] +connect_bd_net -net processing_system_clk \ + [get_bd_pins pll_clk/clk_out1] \ + [get_bd_pins proc_sys_reset/slowest_sync_clk] \ + [get_bd_pins axi_smc0/aclk] \ + [get_bd_pins axi_xbar/ACLK] \ + [get_bd_pins axi_xbar/M00_ACLK] \ + [get_bd_pins axi_xbar/M01_ACLK] \ + [get_bd_pins axi_xbar/M02_ACLK] \ + [get_bd_pins axi_xbar/M03_ACLK] \ + [get_bd_pins axi_xbar/S00_ACLK] \ + [get_bd_pins fetch_0/ap_clk] \ + [get_bd_pins load_0/ap_clk] \ + [get_bd_pins compute_0/ap_clk] \ + [get_bd_pins store_0/ap_clk] \ + [get_bd_pins load_queue/s_aclk] \ + [get_bd_pins gemm_queue/s_aclk] \ + [get_bd_pins store_queue/s_aclk] \ + [get_bd_pins l2g_queue/s_aclk] \ + [get_bd_pins g2l_queue/s_aclk] \ + [get_bd_pins g2s_queue/s_aclk] \ + [get_bd_pins s2g_queue/s_aclk] \ + $maxi_clk \ + $saxi_clk + +# Create address segments +create_bd_addr_seg -range $ip_reg_map_range -offset $fetch_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs fetch_0/s_axi_CONTROL_BUS/Reg] SEG_fetch_0_Reg +create_bd_addr_seg -range $ip_reg_map_range -offset $load_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs load_0/s_axi_CONTROL_BUS/Reg] SEG_load_0_Reg +create_bd_addr_seg -range $ip_reg_map_range -offset $compute_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs compute_0/s_axi_CONTROL_BUS/Reg] SEG_compute_0_Reg +create_bd_addr_seg -range $ip_reg_map_range -offset $store_base_addr [get_bd_addr_spaces processing_system/Data] [get_bd_addr_segs store_0/s_axi_CONTROL_BUS/Reg] SEG_store_0_Reg +if { $device_family eq "zynq-7000" } { + create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM + create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces 
compute_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
+  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
+  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces load_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
+  create_bd_addr_seg -range 0x40000000 -offset 0x00000000 [get_bd_addr_spaces store_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/S_AXI_ACP/ACP_DDR_LOWOCM] SEG_processing_system_ACP_DDR_LOWOCM
+} elseif { $device_family eq "zynq-ultrascale+"} {
+  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces fetch_0/Data_m_axi_ins_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
+  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces load_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
+  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_uop_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
+  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces compute_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
+  create_bd_addr_seg -range 0x80000000 -offset 0x00000000 [get_bd_addr_spaces store_0/Data_m_axi_data_port] [get_bd_addr_segs processing_system/SAXIGP0/HPC0_DDR_LOW] SEG_processing_system_HPC0_DDR_LOW
+}
+
+save_bd_design
 
 ##################################################################
-# MAIN FLOW
+# COMPILATION FLOW
 ##################################################################
 
-create_root_design "" $clock_id $inp_part $wgt_part $out_part $inp_bus_width \
-  $inp_mem_depth $wgt_bus_width $wgt_mem_depth $out_bus_width $out_mem_depth
-
 # Create top-level wrapper file
 make_wrapper -files \
   [get_files $proj_path/$proj_name.srcs/sources_1/bd/$proj_name/$proj_name.bd] -top
@@ -937,8 +421,7 @@ update_compile_order -fileset sources_1
 update_compile_order -fileset sim_1
 
 # Run bitstream generation on 8 threads with a performance-oriented P&R strategy
-# create_run impl_1 -parent_run synth_1 -flow {Vivado Implementation 2017} \
-#   -strategy "Performance_ExplorePostRoutePhysOpt"
+set num_threads 8
 launch_runs impl_1 -to_step write_bitstream -jobs $num_threads
 wait_on_run impl_1
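
The address segments above consume $ip_reg_map_range and the $*_base_addr variables instead of hard-coded constants, so the Tcl generator and the C++ driver now share a single register map. The sketch below illustrates that single-source-of-truth idea in Python; the constant and macro names are hypothetical (the real derivation lives in vta/python/vta/pkg_config.py), and the values shown are the old hard-coded Zynq-7000 map that this patch removes.

# Illustrative sketch only: one table feeds both the hardware generator
# (as Tcl arguments) and the runtime driver (as -D compile flags).
IP_REG_MAP_RANGE  = 0x00010000  # per-IP register aperture
FETCH_BASE_ADDR   = 0x43C00000  # old Zynq-7000 values, for illustration
COMPUTE_BASE_ADDR = 0x43C10000
LOAD_BASE_ADDR    = 0x43C20000
STORE_BASE_ADDR   = 0x43C30000

def cflags():
    """Derive driver compile flags (macro spellings are hypothetical)."""
    return ["-DVTA_IP_REG_MAP_RANGE=0x%08X" % IP_REG_MAP_RANGE,
            "-DVTA_FETCH_ADDR=0x%08X" % FETCH_BASE_ADDR,
            "-DVTA_COMPUTE_ADDR=0x%08X" % COMPUTE_BASE_ADDR,
            "-DVTA_LOAD_ADDR=0x%08X" % LOAD_BASE_ADDR,
            "-DVTA_STORE_ADDR=0x%08X" % STORE_BASE_ADDR]

def tcl_args():
    """Derive the matching arguments for vivado.tcl."""
    return [IP_REG_MAP_RANGE, FETCH_BASE_ADDR, COMPUTE_BASE_ADDR,
            LOAD_BASE_ADDR, STORE_BASE_ADDR]

if __name__ == "__main__":
    print(" ".join(cflags()))
    print(" ".join("0x%08X" % v for v in tcl_args()))
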
diff --git a/vta/hardware/xilinx/sim/vta_test.cc b/vta/hardware/xilinx/sim/vta_test.cc
index d21d121a8ddb..e3df31a9ddfe 100644
--- a/vta/hardware/xilinx/sim/vta_test.cc
+++ b/vta/hardware/xilinx/sim/vta_test.cc
@@ -35,17 +35,6 @@ int main(void) {
   printParameters();
 #endif
 
-  // Micro op bound
-  assert(VTA_UOP_GEM_2_1 < VTA_UOP_WIDTH);
-  assert(VTA_UOP_ALU_1_1 < VTA_UOP_WIDTH);
-  // Make sure there is no misaligment
-  assert(VTA_INSN_GEM_9_1 < VTA_INSN_GEM_A_0);
-  assert(VTA_INSN_MEM_7_1 < VTA_INSN_MEM_8_0);
-  // Instruction bounds
-  assert(VTA_INSN_MEM_E_1 < VTA_INS_WIDTH);
-  assert(VTA_INSN_GEM_F_1 < VTA_INS_WIDTH);
-  assert(VTA_INSN_ALU_G_1 < VTA_INS_WIDTH);
-
   int status = 0;
 
   // Run ALU test (vector-scalar operators)
@@ -65,15 +54,15 @@ int main(void) {
   status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
   status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
   status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);
+  status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, true);
+  status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, false);
 
   // Run blocked GEMM test
-  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
   status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
-  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
   status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
 
   // Simple GEMM unit test
-  status |= gemm_test(64, 64, 64, true);
+  status |= gemm_test(4 * VTA_BATCH, 4 * VTA_BLOCK_OUT, 4 * VTA_BLOCK_IN, false);
 
   return status;
 }
diff --git a/vta/hardware/xilinx/src/vta.cc b/vta/hardware/xilinx/src/vta.cc
index cb6b7b76cd11..fba9b4febcf8 100644
--- a/vta/hardware/xilinx/src/vta.cc
+++ b/vta/hardware/xilinx/src/vta.cc
@@ -18,7 +18,6 @@
  */
 
 /*!
- * Copyright (c) 2018 by Contributors
  * \file vta.cc
  * \brief VTA HLS design.
  */
@@ -29,13 +28,114 @@
 
 #include "vta.h"
 
+template <typename DATA_T, int MAT_AXI_RATIO>
+void reset_mem(
+  memop_sram_T &sram_idx,
+  memop_sram_T range,
+  DATA_T mem[][MAT_AXI_RATIO]) {
+
+  for (int i = 0; i < range; i ++) {
+    for (int j = 0; j < MAT_AXI_RATIO; j ++) {
+#pragma HLS UNROLL
+      mem[sram_idx][j] = 0;
+    }
+    sram_idx ++;
+  }
+}
+
+template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES>
+void load_pad_2d(
+  volatile DATA_T *src,
+  DATA_T dst[][MAT_AXI_RATIO],
+  memop_sram_T sram_idx,
+  memop_dram_T dram_idx,
+  memop_size_T y_size,
+  memop_size_T x_size,
+  memop_stride_T x_stride,
+  memop_pad_T x_pad_0,
+  memop_pad_T x_pad_1,
+  memop_sram_T y_offset_0,
+  memop_sram_T y_offset_1) {
+#pragma HLS INLINE
+
+  reset_mem(sram_idx, y_offset_0, dst);
+  for (int y = 0; y < y_size; y++) {
+#pragma HLS PIPELINE
+    reset_mem(sram_idx, x_pad_0, dst);
+    memcpy(&dst[sram_idx][0],
+           (const DATA_T*) &src[dram_idx * MAT_AXI_RATIO],
+           x_size * ELEM_BYTES);
+    sram_idx += x_size;
+    dram_idx += x_stride;
+    reset_mem(sram_idx, x_pad_1, dst);
+  }
+  reset_mem(sram_idx, y_offset_1, dst);
+}
+
+template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES>
+void load_2d(
+  volatile DATA_T *src,
+  DATA_T dst[][MAT_AXI_RATIO],
+  memop_sram_T sram_idx,
+  memop_dram_T dram_idx,
+  memop_size_T y_size,
+  memop_size_T x_size,
+  memop_stride_T x_stride) {
+#pragma HLS INLINE
+
+  for (int y = 0; y < y_size; y++) {
+    memcpy(&dst[sram_idx][0],
+           (const DATA_T*) &src[dram_idx * MAT_AXI_RATIO],
+           x_size * ELEM_BYTES);
+#pragma HLS RESOURCE variable = sram_idx core = Mul_LUT
+    sram_idx += x_size;
+    dram_idx += x_stride;
+  }
+}
+
+template <typename WIDE_T, typename NARROW_T, typename IDX_T, int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM>
+void read_tensor(
+  IDX_T idx,
+  WIDE_T src[][NARROW_W * Y_DIM * X_DIM / WIDE_W],
+  NARROW_T dst[Y_DIM][X_DIM]) {
+#pragma HLS INLINE
+
+  // Read in the wide words and unpack them into tensor elements
+  for (int p = 0; p < NARROW_W * Y_DIM * X_DIM / WIDE_W; p++) {
+    WIDE_T packet = src[idx][p];
+    for (int w = 0; w < (WIDE_W / NARROW_W); w++) {
+      int x = (p * (WIDE_W / NARROW_W) + w) / X_DIM;
+      int y = (p * (WIDE_W / NARROW_W) + w) % X_DIM;
+      dst[x][y] = (NARROW_T) packet.range((w + 1) * NARROW_W - 1, w * NARROW_W);
+    }
+  }
+}
+
+template <typename WIDE_T, typename NARROW_T, typename IDX_T, int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM>
+void write_tensor(
+  IDX_T idx,
+  NARROW_T src[Y_DIM][X_DIM],
+  WIDE_T dst[][NARROW_W * Y_DIM * X_DIM / WIDE_W]) {
+#pragma HLS INLINE
+
+  for (int p = 0; p < NARROW_W * Y_DIM * X_DIM / WIDE_W; p++) {
+    WIDE_T packet = 0;
+    for (int w = 0; w < (WIDE_W / NARROW_W); w++) {
+      int x = (p * (WIDE_W / NARROW_W) + w) / X_DIM;
+      int y = (p * (WIDE_W / NARROW_W) + w) % X_DIM;
+      packet.range((w + 1) * NARROW_W - 1, w * NARROW_W) = src[x][y];
+    }
+    dst[idx][p] = packet;
+  }
+}
+
 
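
The read_tensor and write_tensor helpers above pack and unpack tensors between the wide bus_T words stored in the SRAMs and per-element grids. The index arithmetic is easier to follow in scalar form; the following Python mirrors read_tensor with toy widths (illustration only, not part of the patch):

# Sketch of read_tensor's unpacking arithmetic: slice WIDE_W-bit words into
# NARROW_W-bit elements of a Y_DIM x X_DIM grid (toy widths, not VTA's).
WIDE_W, NARROW_W = 64, 8
Y_DIM, X_DIM = 2, 4
RATIO = NARROW_W * Y_DIM * X_DIM // WIDE_W  # wide words per tensor

def read_tensor(src_words):
    dst = [[0] * X_DIM for _ in range(Y_DIM)]
    for p in range(RATIO):
        packet = src_words[p]
        for w in range(WIDE_W // NARROW_W):
            x = (p * (WIDE_W // NARROW_W) + w) // X_DIM
            y = (p * (WIDE_W // NARROW_W) + w) % X_DIM
            # Same as packet.range((w + 1) * NARROW_W - 1, w * NARROW_W)
            dst[x][y] = (packet >> (w * NARROW_W)) & ((1 << NARROW_W) - 1)
    return dst

# Little-endian packing: element k of the tensor is byte k of the wide word.
assert read_tensor([0x0706050403020100]) == [[0, 1, 2, 3], [4, 5, 6, 7]]
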
 void fetch(
   uint32_t insn_count,
   volatile insn_T *insns,
   hls::stream<insn_T> &load_queue,
   hls::stream<insn_T> &gemm_queue,
   hls::stream<insn_T> &store_queue) {
-#pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
+PRAGMA_HLS(HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS offset = VTA_FETCH_INSN_COUNT_OFFSET)
 #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
 #pragma HLS INTERFACE axis port = load_queue
 #pragma HLS INTERFACE axis port = gemm_queue
@@ -43,170 +143,288 @@ void fetch(
 #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
 
   INSN_DECODE: for (int pc = 0; pc < insn_count; pc++) {
-#pragma HLS PIPELINE II = 1
+#pragma HLS PIPELINE
     // Read instruction fields
-    insn_T insn = insns[pc];
+    insn_T raw_insn = insns[pc];
+    VTAInsn insn;
+    insn.generic = *((VTAGenericInsn *) &raw_insn);
     // Do some partial decoding
-    opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0);
-    memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
+    opcode_T opcode = insn.generic.opcode;
+    memop_id_T memory_type = insn.mem.memory_type;
     // Push to appropriate instruction queue
     if (opcode == VTA_OPCODE_STORE) {
-      store_queue.write(insn);
-    } else if (opcode == VTA_OPCODE_LOAD &&
-        (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT)) {
-      load_queue.write(insn);
+      store_queue.write(raw_insn);
+    } else if (opcode == VTA_OPCODE_LOAD) {
+      if (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT) {
+        load_queue.write(raw_insn);
+      } else {
+        gemm_queue.write(raw_insn);
+      }
     } else {
-      gemm_queue.write(insn);
+      gemm_queue.write(raw_insn);
     }
   }
 }
 
 void load(
-  volatile inp_vec_T *inputs,
-  volatile wgt_vec_T *weights,
+  volatile bus_T *inputs,
+  volatile bus_T *weights,
   hls::stream<insn_T> &load_queue,
   hls::stream<bool> &g2l_dep_queue,
   hls::stream<bool> &l2g_dep_queue,
-  inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
-  wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]
-  ) {
-#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
+  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
+  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]) {
 #pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
+#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
 #pragma HLS INTERFACE axis port = load_queue
 #pragma HLS INTERFACE axis port = g2l_dep_queue
 #pragma HLS INTERFACE axis port = l2g_dep_queue
 #pragma HLS INTERFACE bram port = wgt_mem
 #pragma HLS INTERFACE bram port = inp_mem
 #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
+#pragma HLS RESOURCE variable = inp_mem core = RAM_1P
+#pragma HLS RESOURCE variable = wgt_mem core = RAM_1P
 
   // Pop load instruction
-  insn_T insn = load_queue.read();
-
-  // Decode instruction
-  bool pop_prev_dependence = insn[VTA_INSN_MEM_1];
-  bool pop_next_dependence = insn[VTA_INSN_MEM_2];
-  bool push_prev_dependence = insn[VTA_INSN_MEM_3];
-  bool push_next_dependence = insn[VTA_INSN_MEM_4];
-  memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
-  memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0);
-  memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0);
-  memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0);
-  memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0);
-  memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0);
-  memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0);
-  memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0);
-  
memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0); - memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0); + insn_T raw_insn = load_queue.read(); + // Cast to MemInsn + insn_T raw_copy = raw_insn; + VTAMemInsn insn = *((VTAMemInsn *) &raw_copy); // Pop dependence token if instructed - if (pop_next_dependence) { + if (insn.pop_next_dep) { g2l_dep_queue.read(); } - // Initialize indices - memop_sram_T sram_idx = sram_base; - memop_dram_T dram_idx = dram_base; - - // Pre-compute dimensions, and offsets - memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1; - memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1; - memop_sram_T y_offset = x_size_total * y_pad_0; -// Force this computation to be done with LUTs to avoid using too many DSPs -#pragma HLS RESOURCE variable = y_offset core = Mul_LUT - - // Skip padding along y dimension - sram_idx += y_offset; + // Pre-processing + memop_sram_T x_width = (insn.x_pad_0 + insn.x_size + insn.x_pad_1); + memop_sram_T y_offset_0 = x_width * insn.y_pad_0; +#pragma HLS RESOURCE variable = y_offset_0 core = Mul_LUT latency = 4 + memop_sram_T y_offset_1 = x_width * insn.y_pad_1; +#pragma HLS RESOURCE variable = y_offset_1 core = Mul_LUT latency = 4 + + if (insn.memory_type == VTA_MEM_ID_INP) { + load_pad_2d( + inputs, + inp_mem, + insn.sram_base, + insn.dram_base, + insn.y_size, + insn.x_size, + insn.x_stride, + insn.x_pad_0, + insn.x_pad_1, + y_offset_0, + y_offset_1); + } else if (insn.memory_type == VTA_MEM_ID_WGT) { + load_2d( + weights, + wgt_mem, + insn.sram_base, + insn.dram_base, + insn.y_size, + insn.x_size, + insn.x_stride); + } - // Perform data transfer from DRAM - for (int y = 0; y < y_size; y++) { -#pragma HLS PIPELINE rewind - // Skip padding along x dimension - sram_idx += x_pad_0; - // Perform data transfer - if (memory_type == VTA_MEM_ID_INP) { - memcpy(&inp_mem[sram_idx][0], - (const inp_vec_T*) &inputs[dram_idx * VTA_BATCH], - x_size * VTA_INP_ELEM_BYTES); - } else { - memcpy(&wgt_mem[sram_idx][0], - (const wgt_vec_T*) &weights[dram_idx * VTA_BLOCK_OUT], - x_size * VTA_WGT_ELEM_BYTES); - } - sram_idx += x_size; - dram_idx += x_stride; - // Skip padding along x dimension - sram_idx += x_pad_1; + // Push dependence token if instructed + if (insn.push_next_dep) { + l2g_dep_queue.write(1); } +} - // Reset SRAM index - sram_idx = sram_base; - // Pad x/y edges with zeros - for (int y = 0; y < y_size_total; y++) { - if (y < y_pad_0 || y >= y_pad_0 + y_size) { - for (int x = 0; x < x_size_total; x++) { -#pragma HLS PIPELINE II = 1 rewind - if (memory_type == VTA_MEM_ID_INP) { - for (int i = 0; i < VTA_BATCH; i++) { - inp_mem[sram_idx][i] = 0; - } - } else { - for (int i = 0; i < VTA_BLOCK_OUT; i++) { - wgt_mem[sram_idx][i] = 0; - } - } - sram_idx++; - } - } else { - for (int x = 0; x < x_pad_0; x++) { -#pragma HLS PIPELINE II = 1 rewind - if (memory_type == VTA_MEM_ID_INP) { - for (int i = 0; i < VTA_BATCH; i++) { - inp_mem[sram_idx][i] = 0; - } - } else { - for (int i = 0; i < VTA_BLOCK_OUT; i++) { - wgt_mem[sram_idx][i] = 0; +void gemm( + insn_T insn_raw, + uop_T uop_mem[VTA_UOP_BUFF_DEPTH], + bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO], + bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO], + bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO], + bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) { +#pragma HLS INLINE + + VTAGemInsn insn = *((VTAGemInsn *) &insn_raw); + + // Loop offset + acc_idx_T dst_offset_out = 0; + inp_idx_T src_offset_out = 0; + wgt_idx_T wgt_offset_out = 0; + + // 
Outer Loop
+  EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) {
+    acc_idx_T dst_offset_in = dst_offset_out;
+    inp_idx_T src_offset_in = src_offset_out;
+    wgt_idx_T wgt_offset_in = wgt_offset_out;
+
+    // Inner Loop
+    EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) {
+
+      // Iterate over micro op
+      READ_GEMM_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) {
+#pragma HLS PIPELINE II = 1
+        // Read micro-op fields
+        uop_T uop = uop_mem[upc];
+
+        // Decode indices
+        acc_idx_T dst_idx =
+            uop.range(VTA_UOP_GEM_0_1, VTA_UOP_GEM_0_0) + dst_offset_in;
+        inp_idx_T src_idx =
+            uop.range(VTA_UOP_GEM_1_1, VTA_UOP_GEM_1_0) + src_offset_in;
+        wgt_idx_T wgt_idx =
+            uop.range(VTA_UOP_GEM_2_1, VTA_UOP_GEM_2_0) + wgt_offset_in;
+
+        // Read in weight tensor
+        wgt_T w_tensor[VTA_BLOCK_OUT][VTA_BLOCK_IN];
+        read_tensor<bus_T, wgt_T, wgt_idx_T, VTA_BUS_WIDTH, VTA_WGT_WIDTH, VTA_BLOCK_OUT, VTA_BLOCK_IN>(wgt_idx, wgt_mem, w_tensor);
+        // Read in input tensor
+        inp_T i_tensor[VTA_BATCH][VTA_BLOCK_IN];
+        read_tensor<bus_T, inp_T, inp_idx_T, VTA_BUS_WIDTH, VTA_INP_WIDTH, VTA_BATCH, VTA_BLOCK_IN>(src_idx, inp_mem, i_tensor);
+        // Read in accum tensor
+        acc_T a_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+        read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, a_tensor);
+        // Output tensor
+        out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+
+        // Inner GEMM loop
+        for (int b = 0; b < VTA_BATCH; b++) {
+          for (int oc = 0; oc < VTA_BLOCK_OUT; oc++) {
+            // Initialize the accumulator values
+            acc_T accum = a_tensor[b][oc];
+            // Dot product sum
+            sum_T tmp = 0;
+            // Inner matrix multiplication loop (input channel/feature)
+            for (int ic = 0; ic < VTA_BLOCK_IN; ic++) {
+              wgt_T w_elem = w_tensor[oc][ic];
+              inp_T i_elem = i_tensor[b][ic];
+              mul_T prod_dsp = i_elem * w_elem;
+              tmp += (sum_T) prod_dsp;
+            }
+            // Update summation
+            accum += (acc_T) tmp;
+            // Write back result acc_mem
+            a_tensor[b][oc] = insn.reset_reg ? (acc_T) 0 : accum;
+            // And output vector
+            o_tensor[b][oc] = (out_T) accum.range(VTA_OUT_WIDTH - 1, 0);
          }
        }
+
+        // Write the results back into accumulator
+        write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, a_tensor, acc_mem);
+        // Write the results back in the output buffer
+        write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem);
      }
+      // Update offsets
+      dst_offset_in += insn.dst_factor_in;
+      src_offset_in += insn.src_factor_in;
+      wgt_offset_in += insn.wgt_factor_in;
    }
+    // Update offsets
+    dst_offset_out += insn.dst_factor_out;
+    src_offset_out += insn.src_factor_out;
+    wgt_offset_out += insn.wgt_factor_out;
  }
}
+
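
The gemm routine above resolves every SRAM index as a micro-op base plus two loop-carried offsets, with the iter_* and *_factor_* fields of the instruction controlling the sweep. A compact Python model of that addressing scheme (illustration only; the field names follow VTAGemInsn):

# Sketch of gemm()'s two-level uop addressing: yields the (dst, src, wgt)
# SRAM indices touched by one GEMM instruction.
def gemm_indices(insn, uops):
    dst_out, src_out, wgt_out = 0, 0, 0
    for _ in range(insn["iter_out"]):
        dst_in, src_in, wgt_in = dst_out, src_out, wgt_out
        for _ in range(insn["iter_in"]):
            for dst, src, wgt in uops[insn["uop_bgn"]:insn["uop_end"]]:
                yield dst + dst_in, src + src_in, wgt + wgt_in
            dst_in += insn["dst_factor_in"]
            src_in += insn["src_factor_in"]
            wgt_in += insn["wgt_factor_in"]
        dst_out += insn["dst_factor_out"]
        src_out += insn["src_factor_out"]
        wgt_out += insn["wgt_factor_out"]

insn = dict(iter_out=2, iter_in=2, uop_bgn=0, uop_end=1,
            dst_factor_in=1, src_factor_in=1, wgt_factor_in=0,
            dst_factor_out=2, src_factor_out=2, wgt_factor_out=1)
# One uop at (0, 0, 0) swept over a 2x2 iteration space:
print(list(gemm_indices(insn, [(0, 0, 0)])))  # [(0,0,0), (1,1,0), (2,2,1), (3,3,1)]
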
+void alu(
+  insn_T insn_raw,
+  uop_T uop_mem[VTA_UOP_BUFF_DEPTH],
+  bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO],
+  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
+  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
+  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
+#pragma HLS INLINE
+
+  VTAAluInsn insn = *((VTAAluInsn *) &insn_raw);
+
+  // Loop offset
+  acc_idx_T dst_offset_out = 0;
+  inp_idx_T src_offset_out = 0;
+
+  // Outer Loop
+  EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) {
+    acc_idx_T dst_offset_in = dst_offset_out;
+    inp_idx_T src_offset_in = src_offset_out;
+
+    // Inner Loop
+    EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) {
+      // Iterate over micro op
+      READ_ALU_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) {
+#pragma HLS PIPELINE II = 2
+        // Read micro-op fields
+        uop_T uop = uop_mem[upc];
+
+        // Decode
+        acc_idx_T dst_idx =
+            uop.range(VTA_UOP_ALU_0_1, VTA_UOP_ALU_0_0) + dst_offset_in;
+        acc_idx_T src_idx =
+            uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + src_offset_in;
+
+        // Read in src tensor
+        acc_T src_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+        read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(src_idx, acc_mem, src_tensor);
+        // Read in dst tensor
+        acc_T dst_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+        read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, dst_tensor);
+        // Output tensor
+        out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+
+        // Perform ALU op over matrix elements
+        for (int i = 0; i < VTA_BATCH; i++) {
+          for (int b = 0; b < VTA_BLOCK_OUT; b++) {
+            // Read in operands
+            acc_T src_0 = dst_tensor[i][b];
+            acc_T src_1 = insn.use_imm ? (acc_T) insn.imm : src_tensor[i][b];
+            aluop_shr_arg_T shft_by = src_1.range(VTA_SHR_ARG_BIT_WIDTH - 1, 0);
+            aluop_mul_arg_T mul_by = src_1.range(VTA_MUL_ARG_BIT_WIDTH - 1, 0);
+            if (insn.alu_opcode == VTA_ALU_OPCODE_MIN || insn.alu_opcode == VTA_ALU_OPCODE_MAX) {
+              // Compute Min/Max
+              acc_T mix_val = src_0 < src_1 ?
+                  (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) :
+                  (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
+              dst_tensor[i][b] = mix_val;
+              o_tensor[i][b] = (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0);
+            } else if (insn.alu_opcode == VTA_ALU_OPCODE_ADD) {
+              // Compute Sum
+              acc_T add_val =
+                  src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0);
+              dst_tensor[i][b] = add_val;
+              o_tensor[i][b] = (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0);
+            } else if (insn.alu_opcode == VTA_ALU_OPCODE_SHR) {
+              // Compute Shift Right
+              acc_T shr_val = src_0 >> shft_by;
+              dst_tensor[i][b] = shr_val;
+              o_tensor[i][b] = (out_T) shr_val.range(VTA_OUT_WIDTH - 1, 0);
+            }
          }
        }
+
+        // Write the results back into accumulator
+        write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, dst_tensor, acc_mem);
+        // Write the results back in the output buffer
+        write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem);
      }
+      // Update offsets
+      dst_offset_in += insn.dst_factor_in;
+      src_offset_in += insn.src_factor_in;
    }
+    // Update offsets
+    dst_offset_out += insn.dst_factor_out;
+    src_offset_out += insn.src_factor_out;
  }
}
 
 void compute(
   volatile uint32_t &done,
   volatile uop_T *uops,
-  volatile acc_vec_T *biases,
+  volatile bus_T *biases,
   hls::stream<insn_T> &gemm_queue,
   hls::stream<bool> &l2g_dep_queue,
   hls::stream<bool> &s2g_dep_queue,
   hls::stream<bool> &g2l_dep_queue,
   hls::stream<bool> &g2s_dep_queue,
-  inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
-  wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT],
-  out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]
-  ) {
-#pragma HLS INTERFACE s_axilite port = done bundle = CONTROL_BUS
+  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
+  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
+  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
+PRAGMA_HLS(HLS INTERFACE s_axilite port = done bundle = CONTROL_BUS offset = VTA_COMPUTE_DONE_WR_OFFSET)
 #pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
 #pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
 #pragma HLS INTERFACE axis port = gemm_queue
@@ -218,351 +436,119 @@ void compute(
 #pragma HLS INTERFACE bram port = wgt_mem
 #pragma HLS INTERFACE bram port = out_mem
 #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
-// This is necessary connect the SRAM to the load module
+#pragma HLS RESOURCE variable = inp_mem core = RAM_1P
 #pragma HLS RESOURCE variable = wgt_mem core = RAM_1P
+#pragma HLS RESOURCE variable = out_mem core = 
RAM_1P // Micro-op storage static uop_T uop_mem[VTA_UOP_BUFF_DEPTH]; // Accumulator storage - static acc_vec_T acc_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]; -#pragma HLS ARRAY_PARTITION variable = acc_mem complete dim = 2 + static bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO]; +#pragma HLS ARRAY_RESHAPE variable = acc_mem complete dim=2 +// This is necessary to obtain II=1 +#pragma HLS DEPENDENCE variable = acc_mem inter false // Pop GEMM instruction - insn_T insn = gemm_queue.read(); - - // Decode - opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0); - bool pop_prev_dependence = insn[VTA_INSN_MEM_1]; - bool pop_next_dependence = insn[VTA_INSN_MEM_2]; - bool push_prev_dependence = insn[VTA_INSN_MEM_3]; - bool push_next_dependence = insn[VTA_INSN_MEM_4]; + insn_T raw_insn = gemm_queue.read(); + // Cast to GenericInsn + VTAInsn insn; + insn_T raw_copy = raw_insn; + insn.generic = *((VTAGenericInsn *) &raw_copy); // Pop dependence token if instructed - if (pop_prev_dependence) { + if (insn.generic.pop_prev_dep) { l2g_dep_queue.read(); } - if (pop_next_dependence) { + if (insn.generic.pop_next_dep) { s2g_dep_queue.read(); } + // Set done value + done = 0; // Perform action based on opcode - if (opcode == VTA_OPCODE_FINISH) { + if (insn.generic.opcode == VTA_OPCODE_FINISH) { // Set done flag if we reach a FINISH instruction done = 1; - } else if (opcode == VTA_OPCODE_LOAD || opcode == VTA_OPCODE_STORE) { - // Set done value - done = 0; - - // Decode instruction - memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0); - memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0); - memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0); - memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0); - memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0); - memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0); - memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0); - memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0); - memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0); - memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0); - + } else if (insn.generic.opcode == VTA_OPCODE_LOAD) { // Initialize indices - memop_sram_T sram_idx = sram_base; - memop_dram_T dram_idx = dram_base; - - // Pre-compute dimensions, and offsets - memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1; - memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1; - memop_sram_T y_offset = x_size_total * y_pad_0; -// Force this computation to be done with LUTs to avoid using too many DSPs -#pragma HLS RESOURCE variable = y_offset core = Mul_LUT - - if (memory_type == VTA_MEM_ID_UOP) { + memop_sram_T sram_idx = insn.mem.sram_base; + memop_dram_T dram_idx = insn.mem.dram_base; + if (insn.mem.memory_type == VTA_MEM_ID_UOP) { // Perform data transfer - memcpy(&uop_mem[sram_base], - (const uop_T*) &uops[dram_base], - x_size * sizeof(uop_T)); - } else { - // Skip vertical padding - sram_idx += y_offset; + memcpy(&uop_mem[sram_idx], + (const uop_T*) &uops[dram_idx], + insn.mem.x_size * sizeof(uop_T)); + } else if (insn.mem.memory_type == VTA_MEM_ID_ACC) { // Perform data transfer from DRAM - for (int y = 0; y < y_size; y++) { -#pragma HLS PIPELINE rewind - // Skip padding along x dimension - sram_idx += x_pad_0; - // Perform data transfer - memcpy(&acc_mem[sram_idx][0], - (const acc_vec_T*) &biases[dram_idx * VTA_BATCH], - x_size*VTA_ACC_ELEM_BYTES); - sram_idx += 
x_size; - dram_idx += x_stride; - // Skip padding along x dimension - sram_idx += x_pad_1; - } - } - } else if (opcode == VTA_OPCODE_GEMM || opcode == VTA_OPCODE_ALU) { - // Set done value - done = 0; - - // Decode - bool reset_out = insn[VTA_INSN_GEM_5]; - uop_idx_T uop_bgn = insn.range(VTA_INSN_GEM_6_1, VTA_INSN_GEM_6_0); - uop_idx_T uop_end = insn.range(VTA_INSN_GEM_7_1, VTA_INSN_GEM_7_0); - loop_T iter_out = insn.range(VTA_INSN_GEM_8_1, VTA_INSN_GEM_8_0); - loop_T iter_in = insn.range(VTA_INSN_GEM_9_1, VTA_INSN_GEM_9_0); - acc_idx_T dst_factor_out = insn.range(VTA_INSN_GEM_A_1, VTA_INSN_GEM_A_0); - acc_idx_T dst_factor_in = insn.range(VTA_INSN_GEM_B_1, VTA_INSN_GEM_B_0); - inp_idx_T src_factor_out = insn.range(VTA_INSN_GEM_C_1, VTA_INSN_GEM_C_0); - inp_idx_T src_factor_in = insn.range(VTA_INSN_GEM_D_1, VTA_INSN_GEM_D_0); - - // GEMM-specific fields - wgt_idx_T wgt_factor_out = insn.range(VTA_INSN_GEM_E_1, VTA_INSN_GEM_E_0); - wgt_idx_T wgt_factor_in = insn.range(VTA_INSN_GEM_F_1, VTA_INSN_GEM_F_0); - - // ALU-specific field - aluop_opcode_T alu_opcode = insn.range(VTA_INSN_ALU_E_1, VTA_INSN_ALU_E_0); - bool use_imm = insn[VTA_INSN_ALU_F]; - aluop_imm_T imm = insn.range(VTA_INSN_ALU_G_1, VTA_INSN_ALU_G_0); - acc_idx_T dst_offset_out = 0; - inp_idx_T src_offset_out = 0; - wgt_idx_T wgt_offset_out = 0; - - // Outer Loop - EXE_OUT_LOOP: for (int it_out = 0; it_out < iter_out; it_out++) { -#pragma HLS DEPENDENCE variable = acc_mem inter false - acc_idx_T dst_offset_in = dst_offset_out; - inp_idx_T src_offset_in = src_offset_out; - wgt_idx_T wgt_offset_in = wgt_offset_out; - - // Inner Loop - EXE_IN_LOOP: for (int it_in = 0; it_in < iter_in; it_in++) { - // Perform appropriate computation based on opcode - if (opcode == VTA_OPCODE_GEMM) { - // Iterate over micro op - READ_GEMM_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) { -#pragma HLS PIPELINE II = 1 rewind - - // Read micro-op fields - uop_T uop = uop_mem[upc]; - - // Decode indices - acc_idx_T dst_idx = - uop.range(VTA_UOP_GEM_0_1, VTA_UOP_GEM_0_0) + dst_offset_in; - inp_idx_T src_idx = - uop.range(VTA_UOP_GEM_1_1, VTA_UOP_GEM_1_0) + src_offset_in; - wgt_idx_T wgt_idx = - uop.range(VTA_UOP_GEM_2_1, VTA_UOP_GEM_2_0) + wgt_offset_in; - - // Read weight matrix - wgt_vec_T w_matrix[VTA_BLOCK_OUT]; - for (int i = 0; i < VTA_BLOCK_OUT; i++) { - w_matrix[i] = wgt_mem[wgt_idx][i]; - } - // Read input matrix and accum matrix - acc_vec_T o_matrix[VTA_BATCH]; - inp_vec_T i_matrix[VTA_BATCH]; - for (int i = 0; i < VTA_BATCH; i++) { - o_matrix[i] = acc_mem[dst_idx][i]; - i_matrix[i] = inp_mem[src_idx][i]; - } - // Result matrices - acc_vec_T acc_mem_val[VTA_BATCH]; - out_vec_T st_buf_val[VTA_BATCH]; - - // Inner GEMM loop - for (int i = 0; i < VTA_BATCH; i++) { - for (int b = 0; b < VTA_BLOCK_OUT; b++) { - // Initialize the accumulator values - acc_T accum = - o_matrix[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH); - // Dot product sum - sum_T tmp = 0; - // Inner matrix multiplication loop (input channel/feature) - for (int k = 0; k < VTA_BLOCK_IN; k++) { - wgt_T w_elem = - w_matrix[b].range((k + 1) * VTA_WGT_WIDTH - 1, k * VTA_WGT_WIDTH); - inp_T i_elem = - i_matrix[i].range((k + 1) * VTA_INP_WIDTH - 1, k * VTA_INP_WIDTH); - mul_T prod = i_elem * w_elem; -#ifdef NO_DSP -#pragma HLS RESOURCE variable = prod core = Mul_LUT -#endif // NO_DSP - tmp += (sum_T) prod; - } - // Update summation - accum += (acc_T) tmp; - // Update result vector - acc_mem_val[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = - reset_out ? 
(acc_T) 0 : accum; - st_buf_val[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) = - (out_T) accum.range(VTA_OUT_WIDTH - 1, 0); - } - // Write to buffers - acc_mem[dst_idx][i] = acc_mem_val[i]; - out_mem[dst_idx][i] = st_buf_val[i]; - } - } - } -#ifndef NO_ALU - else if (opcode == VTA_OPCODE_ALU) { - // Iterate over micro op - READ_ALU_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) { - // Read micro-op fields - uop_T uop = uop_mem[upc]; - - // Decode - acc_idx_T dst_idx = - uop.range(VTA_UOP_ALU_0_1, VTA_UOP_ALU_0_0) + dst_offset_in; - acc_idx_T src_idx = - uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + src_offset_in; - - // Perform ALU op over matrix elements - for (int i = 0; i < VTA_BATCH; i++) { - // Read input matrix and accum matrix - acc_vec_T dst_vector = acc_mem[dst_idx][i]; - acc_vec_T src_vector = acc_mem[src_idx][i]; - // Result matrices - acc_vec_T cmp_res; - acc_vec_T add_res; - acc_vec_T shr_res; - out_vec_T short_cmp_res; - out_vec_T short_add_res; - out_vec_T short_shr_res; - // Results vector - acc_vec_T res_vec = 0; - for (int b = 0; b < VTA_BLOCK_OUT; b++) { -#pragma HLS PIPELINE II = 1 rewind - // Read in operands - acc_T src_0 = dst_vector.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH); - acc_T src_1 = use_imm ? - (acc_T) imm : - src_vector.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH); - // Compute Min/Max - acc_T mix_val = src_0 < src_1 ? - (alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) : - (alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0); - cmp_res.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = mix_val; - short_cmp_res.range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) = - (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0); - // Compute Sum - acc_T add_val = - src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0); - add_res.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = add_val; - short_add_res.range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) = - (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0); - // Compute Shift Right - acc_T shr_val = - src_0 >> (aluop_sh_imm_T) src_1.range(VTA_LOG_ACC_WIDTH - 1, 0); - shr_res.range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = shr_val; - short_shr_res.range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) = - (out_T) shr_val.range(VTA_OUT_WIDTH-1, 0); - } - - // Store to accum memory/store buffer - if (alu_opcode == VTA_ALU_OPCODE_MIN || - alu_opcode == VTA_ALU_OPCODE_MAX) { - acc_mem[dst_idx][i] = cmp_res; - out_mem[dst_idx][i] = short_cmp_res; - } else if (alu_opcode == VTA_ALU_OPCODE_ADD) { - acc_mem[dst_idx][i] = add_res; - out_mem[dst_idx][i] = short_add_res; - } else if (alu_opcode == VTA_ALU_OPCODE_SHR) { - acc_mem[dst_idx][i] = shr_res; - out_mem[dst_idx][i] = short_shr_res; - } - } - } - } -#endif // NO_ALU - - // Update offsets - dst_offset_in += dst_factor_in; - src_offset_in += src_factor_in; - wgt_offset_in += wgt_factor_in; - } - - // Update offsets - dst_offset_out += dst_factor_out; - src_offset_out += src_factor_out; - wgt_offset_out += wgt_factor_out; + load_2d( + biases, + acc_mem, + sram_idx, + dram_idx, + insn.mem.y_size, + insn.mem.x_size, + insn.mem.x_stride); } + } else if (insn.generic.opcode == VTA_OPCODE_GEMM) { + gemm(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem); + } else if (insn.generic.opcode == VTA_OPCODE_ALU) { + alu(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem); } // Push dependence token if instructed - if (push_prev_dependence) { + if (insn.generic.push_prev_dep) { g2l_dep_queue.write(1); } - if 
(push_next_dependence) { + if (insn.generic.push_next_dep) { g2s_dep_queue.write(1); } } void store( - volatile out_vec_T *outputs, + volatile bus_T *outputs, hls::stream &store_queue, hls::stream &g2s_dep_queue, hls::stream &s2g_dep_queue, - out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH] - ) { + bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) { #pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port #pragma HLS INTERFACE axis port = store_queue #pragma HLS INTERFACE axis port = g2s_dep_queue #pragma HLS INTERFACE axis port = s2g_dep_queue #pragma HLS INTERFACE bram port = out_mem #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS +#pragma HLS RESOURCE variable = out_mem core = RAM_1P - // Load buffer - insn_T insn = store_queue.read(); - - // Decode - bool pop_prev_dependence = insn[VTA_INSN_MEM_1]; - bool pop_next_dependence = insn[VTA_INSN_MEM_2]; - bool push_prev_dependence = insn[VTA_INSN_MEM_3]; - bool push_next_dependence = insn[VTA_INSN_MEM_4]; - memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0); - memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0); - memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0); - memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0); - memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0); - memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0); - memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0); - memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0); - memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0); - memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0); + // Pop store instruction + insn_T raw_insn = store_queue.read(); + // Cast to MemInsn + insn_T raw_copy = raw_insn; + VTAMemInsn insn = *((VTAMemInsn *) &raw_copy); // Pop dependence token if instructed - if (pop_prev_dependence) { + if (insn.pop_prev_dep) { g2s_dep_queue.read(); } // Initialize indices - memop_sram_T sram_idx = sram_base; - memop_dram_T dram_idx = dram_base; - - // Skip padding along y dimension - memop_sram_T y_offset = (x_pad_0 + x_size + x_pad_1) * y_pad_0; - sram_idx += y_offset; -// Force this computation to be done with LUTs to avoid using too many DSPs -#pragma HLS RESOURCE variable = y_offset core = Mul_LUT + memop_sram_T sram_idx = insn.sram_base; + memop_dram_T dram_idx = insn.dram_base; // Copy along y dimension - for (int y = 0; y < y_size; y++) { -#pragma HLS PIPELINE rewind - // Skip padding along x dimension - sram_idx += x_pad_0; + for (int y = 0; y < insn.y_size; y++) { +#pragma HLS PIPELINE // Perform data transfer memcpy( - const_cast(&outputs[dram_idx*VTA_BATCH]), - (const out_vec_T*) &out_mem[sram_idx][0], - x_size * VTA_INP_ELEM_BYTES); - sram_idx += x_size; - dram_idx += x_stride; - // Skip padding along x dimension - sram_idx += x_pad_1; + const_cast(&outputs[dram_idx * OUT_MAT_AXI_RATIO]), + (const bus_T*) &out_mem[sram_idx][0], + insn.x_size * VTA_OUT_ELEM_BYTES); +#pragma HLS RESOURCE variable = sram_idx core = Mul_LUT + sram_idx += insn.x_size; + dram_idx += insn.x_stride; } // Push dependence token if instructed - if (push_prev_dependence) { + if (insn.push_prev_dep) { s2g_dep_queue.write(1); } } @@ -571,10 +557,10 @@ void vta( uint32_t insn_count, volatile insn_T *insns, volatile uop_T *uops, - volatile inp_vec_T *inputs, - volatile wgt_vec_T *weights, - volatile acc_vec_T *biases, - volatile out_vec_T *outputs) { + volatile 
bus_T *inputs, + volatile bus_T *weights, + volatile bus_T *biases, + volatile bus_T *outputs) { #pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port #pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port @@ -606,14 +592,14 @@ void vta( hls::stream s2g_dep_queue; PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=s2g_dep_queue) hls::stream g2l_dep_queue; - PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2s_dep_queue) + PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2l_dep_queue) hls::stream g2s_dep_queue; PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2s_dep_queue) // Instantiate memories - inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH]; - wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]; - out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]; + bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO]; + bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]; + bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]; // Push all instructions into the queues fetch(insn_count, insns, tmp_load_queue, tmp_gemm_queue, tmp_store_queue); @@ -642,9 +628,9 @@ void vta( tmp_load_popped = true; } // Check dependences and invoke the load stage - bool pop_next_dependence = tmp_load[VTA_INSN_MEM_2]; - if ((pop_next_dependence && !g2l_dep_queue.empty()) || - !pop_next_dependence) { + VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_load); + if ((insn.pop_next_dep && !g2l_dep_queue.empty()) || + !insn.pop_next_dep) { // Push the instruction in the load queue load_queue.write(tmp_load); tmp_load_popped = false; @@ -662,16 +648,15 @@ void vta( tmp_gemm_popped = true; } // Check dependences and invoke the load stage - bool pop_prev_dependence = tmp_gemv[VTA_INSN_MEM_1]; - bool pop_next_dependence = tmp_gemv[VTA_INSN_MEM_2]; + VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv); if ( - (pop_prev_dependence && !l2g_dep_queue.empty() && - pop_next_dependence && !s2g_dep_queue.empty()) || - (!pop_prev_dependence && pop_next_dependence && + (insn.pop_prev_dep && !l2g_dep_queue.empty() && + insn.pop_next_dep && !s2g_dep_queue.empty()) || + (!insn.pop_prev_dep && insn.pop_next_dep && !s2g_dep_queue.empty()) || - (pop_prev_dependence && !l2g_dep_queue.empty() && - !pop_next_dependence) || - (!pop_prev_dependence && !pop_next_dependence) + (insn.pop_prev_dep && !l2g_dep_queue.empty() && + !insn.pop_next_dep) || + (!insn.pop_prev_dep && !insn.pop_next_dep) ) { // Push the instruction in the load queue gemm_queue.write(tmp_gemv); @@ -692,9 +677,10 @@ void vta( tmp_store_popped = true; } // Check dependences and invoke the load stage - bool pop_prev_dependence = tmp_store[VTA_INSN_MEM_1]; - if ((pop_prev_dependence && !g2s_dep_queue.empty()) || - !pop_prev_dependence) { + VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_store); + + if ((insn.pop_prev_dep && !g2s_dep_queue.empty()) || + !insn.pop_prev_dep) { // Push the instruction in the load queue store_queue.write(tmp_store); tmp_store_popped = false; @@ -716,10 +702,11 @@ void vta( } } if (tmp_gemm_popped) { - if (l2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_1]) { + VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv); + if (l2g_dep_queue.empty() && insn.pop_prev_dep) { printf("waiting on l2g\n"); } - if (s2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_2]) { + if (s2g_dep_queue.empty() && insn.pop_next_dep) { printf("waiting on s2g\n"); } } diff --git a/vta/hardware/xilinx/src/vta.h b/vta/hardware/xilinx/src/vta.h index 
1395d5eaba8e..d796e2265d4f 100644
--- a/vta/hardware/xilinx/src/vta.h
+++ b/vta/hardware/xilinx/src/vta.h
@@ -18,7 +18,6 @@
  */
 
 /*!
- * Copyright (c) 2018 by Contributors
  * \file vta.h
  * \brief Type definitions and prototype for VTA HLS design.
  */
@@ -32,6 +31,16 @@
 
 #include
 
+/*!
+* Define HLS stream depth
+*/
+#define PRAGMA_SUB(x) _Pragma (#x)
+#define PRAGMA_HLS(x) PRAGMA_SUB(x)
+#define STREAM_IN_DEPTH 8
+
+/* \typedef bus_T memory bus datatype*/
+typedef ap_uint<VTA_BUS_WIDTH> bus_T;
+
 /* \typedef uop_T Micro-op datatype*/
 typedef ap_uint<VTA_UOP_WIDTH> uop_T;
 
@@ -53,18 +62,6 @@ typedef ap_int mul_T;
 /* \typedef sum_T GEMM accumulator datatype*/
 typedef ap_int sum_T;
 
-/* \typedef inp_vec_T Input vector datatype*/
-typedef ap_uint<VTA_INP_VECTOR_WIDTH> inp_vec_T;
-
-/* \typedef wgt_vec_T Weight vector datatype*/
-typedef ap_uint<VTA_WGT_VECTOR_WIDTH> wgt_vec_T;
-
-/* \typedef acc_vec_T Accumulator vector datatype*/
-typedef ap_uint<VTA_ACC_VECTOR_WIDTH> acc_vec_T;
-
-/* \typedef out_vec_T Output vector datatype*/
-typedef ap_uint<VTA_OUT_VECTOR_WIDTH> out_vec_T;
-
 /* \typedef uop_idx_T Micro-op SRAM index datatype*/
 typedef ap_uint uop_idx_T;
 
@@ -107,18 +104,14 @@ typedef ap_uint memop_pad_T;
 /* \typedef aluop_opcode_T ALU operation opcode datatype*/
 typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
 
-/* \typedef aluop_opcode_T ALU operation immediate datatype*/
+/* \typedef aluop_imm_T ALU operation immediate datatype*/
 typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
 
-/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/
-typedef ap_int<VTA_LOG_ACC_WIDTH> aluop_sh_imm_T;
+/* \typedef aluop_shr_arg_T ALU operation shift right immediate datatype*/
+typedef ap_int<VTA_SHR_ARG_BIT_WIDTH> aluop_shr_arg_T;
 
-/*!
-* Define HLS stream depth
-*/
-#define PRAGMA_SUB(x) _Pragma (#x)
-#define PRAGMA_HLS(x) PRAGMA_SUB(x)
-#define STREAM_IN_DEPTH 8
+/* \typedef aluop_mul_arg_T ALU operation multiply datatype*/
+typedef ap_int<VTA_MUL_ARG_BIT_WIDTH> aluop_mul_arg_T;
 
 /*!
 * \brief Fetch module.
@@ -153,13 +146,13 @@ void fetch(
 * \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
 */
 void load(
-  volatile inp_vec_T *inputs,
-  volatile wgt_vec_T *weights,
+  volatile bus_T *inputs,
+  volatile bus_T *weights,
   hls::stream<insn_T> &load_queue,
   hls::stream<bool> &g2l_dep_queue,
   hls::stream<bool> &l2g_dep_queue,
-  inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
-  wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]);
+  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
+  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]);
 
 /*!
 * \brief Compute module.
@@ -187,15 +180,15 @@ void load(
 void compute(
   volatile uint32_t &done,
   volatile uop_T *uops,
-  volatile acc_vec_T *biases,
+  volatile bus_T *biases,
   hls::stream<insn_T> &gemm_queue,
   hls::stream<bool> &l2g_dep_queue,
   hls::stream<bool> &s2g_dep_queue,
   hls::stream<bool> &g2l_dep_queue,
   hls::stream<bool> &g2s_dep_queue,
-  out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
-  wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT],
-  out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
+  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
+  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
+  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]);
 
 /*!
 * \brief Store module.
@@ -211,11 +204,11 @@ void compute(
 * \param out_mem Local output SRAM buffer. Read only single port BRAM.
 */
 void store(
-  volatile out_vec_T *outputs,
+  volatile bus_T *outputs,
   hls::stream<insn_T> &store_queue,
   hls::stream<bool> &g2s_dep_queue,
   hls::stream<bool> &s2g_dep_queue,
-  out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
+  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]);
 
 /*!
 * \brief VTA wrapper for simulation purpose only.
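With every port and on-chip buffer now expressed in bus_T words, the *_MAT_AXI_RATIO constants decide the second array dimension above. As a quick sanity check of that arithmetic, here is a worked example assuming the stock 1x16x16 pynq configuration (8-bit inputs and weights, 32-bit accumulators) and a 64-bit AXI bus; the numbers are illustrative, not part of the patch:

/* Assumed: VTA_LOG_BUS_WIDTH = 6, i.e. a 64-bit memory bus.
 *   VTA_INP_MATRIX_WIDTH =  8 * 1 * 16  =  128 -> INP_MAT_AXI_RATIO =  128 / 64 =  2
 *   VTA_WGT_MATRIX_WIDTH =  8 * 16 * 16 = 2048 -> WGT_MAT_AXI_RATIO = 2048 / 64 = 32
 *   VTA_ACC_MATRIX_WIDTH = 32 * 1 * 16  =  512 -> ACC_MAT_AXI_RATIO =  512 / 64 =  8
 *   VTA_OUT_MATRIX_WIDTH =  8 * 1 * 16  =  128 -> OUT_MAT_AXI_RATIO =  128 / 64 =  2
 * So out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO] holds each output tensor
 * as two consecutive 64-bit bus words. */

The simulation wrapper that ties these ports together is updated next: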
@@ -232,9 +225,9 @@ void vta(
   uint32_t insn_count,
   volatile insn_T *insns,
   volatile uop_T *uops,
-  volatile inp_vec_T *inputs,
-  volatile wgt_vec_T *weights,
-  volatile acc_vec_T *biases,
-  volatile out_vec_T *outputs);
+  volatile bus_T *inputs,
+  volatile bus_T *weights,
+  volatile bus_T *biases,
+  volatile bus_T *outputs);
 
 #endif  // VTA_VTA_H_
diff --git a/vta/include/vta/driver.h b/vta/include/vta/driver.h
index a6f5fd27f528..410a2b24a090 100644
--- a/vta/include/vta/driver.h
+++ b/vta/include/vta/driver.h
@@ -136,19 +136,23 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size);
 
 /*!
 * \brief Flushes the region of memory out of the CPU cache to DRAM.
- * \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed.
- *  This need to be the physical address.
+ * \param vir_addr Pointer to memory region allocated with VTAMemAlloc to be flushed.
+ *  This needs to be the virtual address.
+ * \param phy_addr Pointer to memory region allocated with VTAMemAlloc to be flushed.
+ *  This needs to be the physical address.
 * \param size Size of the region to flush in Bytes.
 */
-void VTAFlushCache(vta_phy_addr_t buf, int size);
+void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size);
 
 /*!
 * \brief Invalidates the region of memory that is cached.
- * \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated.
- *  This need to be the physical address.
+ * \param vir_addr Pointer to memory region allocated with VTAMemAlloc to be invalidated.
+ *  This needs to be the virtual address.
+ * \param phy_addr Pointer to memory region allocated with VTAMemAlloc to be invalidated.
+ *  This needs to be the physical address.
 * \param size Size of the region to invalidate in Bytes.
 */
-void VTAInvalidateCache(vta_phy_addr_t buf, int size);
+void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size);
 
 #ifdef __cplusplus
 }
diff --git a/vta/include/vta/hw_spec.h b/vta/include/vta/hw_spec.h
index 36b0a5d26b3b..9751b2f137c8 100644
--- a/vta/include/vta/hw_spec.h
+++ b/vta/include/vta/hw_spec.h
@@ -18,7 +18,6 @@
  */
 
 /*!
- * Copyright (c) 2018 by Contributors
 * \file hw_spec.h
 * \brief Preprocessor definitions for VTA HLS design and runtime.
 */
@@ -32,6 +31,9 @@ extern "C" {
 
 #include <stdint.h>
 
+/*! Memory bus width */
+#define VTA_BUS_WIDTH (1 << VTA_LOG_BUS_WIDTH)
+
 /*! log2 of instruction data type width */
 #define VTA_LOG_INS_WIDTH 7
 /*! Instruction data type width */
 #define VTA_INS_WIDTH (1 << VTA_LOG_INS_WIDTH)
@@ -48,10 +50,6 @@ extern "C" {
 #define VTA_OUT_WIDTH (1 << VTA_LOG_OUT_WIDTH)
 /*! Accumulator data type width */
 #define VTA_ACC_WIDTH (1 << VTA_LOG_ACC_WIDTH)
-/*! log2 of ALU data type width */
-#define VTA_LOG_ALU_WIDTH (VTA_LOG_ACC_WIDTH - 1)
-/*! ALU data type width */
-#define VTA_ALU_WIDTH (1 << VTA_LOG_ALU_WIDTH)
 
 /*! Batch size (corresponds to A in (A,B)x(B,C) mat mult)*/
 #define VTA_BATCH (1 << VTA_LOG_BATCH)
@@ -60,15 +58,6 @@
 /*! Blocking factor of the outer loop (corresponds to C in (A,B)x(B,C) mat mult) */
 #define VTA_BLOCK_OUT (1 << VTA_LOG_BLOCK_OUT)
 
-/*! Weight vector width */
-#define VTA_WGT_VECTOR_WIDTH (VTA_WGT_WIDTH * VTA_BLOCK_IN)
-/*! Input vector width */
-#define VTA_INP_VECTOR_WIDTH (VTA_INP_WIDTH * VTA_BLOCK_IN)
-/*! Accumulator vector width */
-#define VTA_ACC_VECTOR_WIDTH (VTA_ACC_WIDTH * VTA_BLOCK_OUT)
-/*! Output vector width */
-#define VTA_OUT_VECTOR_WIDTH (VTA_OUT_WIDTH * VTA_BLOCK_OUT)
-
 /*! On-chip micro-op buffer size in B */
 #define VTA_UOP_BUFF_SIZE (1 << VTA_LOG_UOP_BUFF_SIZE)
 /*!
On-chip weight buffer size in B */ @@ -78,16 +67,36 @@ extern "C" { /*! On-chip accumulator buffer size in B */ #define VTA_ACC_BUFF_SIZE (1 << VTA_LOG_ACC_BUFF_SIZE) +/*! Input vector size in bits */ +#define VTA_INP_MATRIX_WIDTH (VTA_INP_WIDTH * VTA_BATCH * VTA_BLOCK_IN) +/*! Weight vector size in bits */ +#define VTA_WGT_MATRIX_WIDTH (VTA_WGT_WIDTH * VTA_BLOCK_OUT * VTA_BLOCK_IN) +/*! Accumulator vector size in bits */ +#define VTA_ACC_MATRIX_WIDTH (VTA_ACC_WIDTH * VTA_BATCH * VTA_BLOCK_OUT) +/*! Output vector size in bits */ +#define VTA_OUT_MATRIX_WIDTH (VTA_OUT_WIDTH * VTA_BATCH * VTA_BLOCK_OUT) + +/*! Ratio between input matrix size and axi width */ +#define INP_MAT_AXI_RATIO (VTA_INP_MATRIX_WIDTH / VTA_BUS_WIDTH) +/*! Ratio between weight matrix size and axi width */ +#define WGT_MAT_AXI_RATIO (VTA_WGT_MATRIX_WIDTH / VTA_BUS_WIDTH) +/*! Ratio between accumulator matrix size and axi width */ +#define ACC_MAT_AXI_RATIO (VTA_ACC_MATRIX_WIDTH / VTA_BUS_WIDTH) +/*! Ratio between output matrix size and axi width */ +#define OUT_MAT_AXI_RATIO (VTA_OUT_MATRIX_WIDTH / VTA_BUS_WIDTH) + /*! Size of instruction buffer element in B */ #define VTA_INS_ELEM_BYTES (VTA_INS_WIDTH / 8) /*! Size of uop buffer element in B*/ #define VTA_UOP_ELEM_BYTES (VTA_UOP_WIDTH / 8) /*! Size of activation buffer element in B*/ -#define VTA_INP_ELEM_BYTES (VTA_BATCH * VTA_BLOCK_IN * VTA_INP_WIDTH / 8) +#define VTA_INP_ELEM_BYTES (VTA_INP_MATRIX_WIDTH / 8) /*! Size of weight buffer element in B*/ -#define VTA_WGT_ELEM_BYTES (VTA_BLOCK_OUT * VTA_BLOCK_IN * VTA_WGT_WIDTH / 8) +#define VTA_WGT_ELEM_BYTES (VTA_WGT_MATRIX_WIDTH / 8) /*! Size of accumulator buffer element in B*/ -#define VTA_ACC_ELEM_BYTES (VTA_BATCH * VTA_BLOCK_OUT * VTA_ACC_WIDTH / 8) +#define VTA_ACC_ELEM_BYTES (VTA_ACC_MATRIX_WIDTH / 8) +/*! Size of output buffer element in B*/ +#define VTA_OUT_ELEM_BYTES (VTA_OUT_MATRIX_WIDTH / 8) /*! On-chip micro-op buffer depth */ #define VTA_UOP_BUFF_DEPTH (VTA_UOP_BUFF_SIZE / VTA_UOP_ELEM_BYTES) @@ -148,10 +157,14 @@ extern "C" { #define VTA_MEMOP_PAD_BIT_WIDTH 4 /*! Load/Store Instruction: padding value encoding width*/ #define VTA_MEMOP_PAD_VAL_BIT_WIDTH 2 -/*! ALU Instruction: immediate bitwidth*/ -#define VTA_ALUOP_IMM_BIT_WIDTH 16 /*! GEMM/ALU Instruction: loop max iter bits */ #define VTA_LOOP_ITER_WIDTH 14 +/*! ALU Instruction: immediate bitwidth*/ +#define VTA_ALUOP_IMM_BIT_WIDTH 16 +/*! ALU Instruction: shift arg bitwidth*/ +#define VTA_SHR_ARG_BIT_WIDTH (VTA_LOG_ACC_WIDTH) +/*! ALU Instruction: multiply arg bitwidth*/ +#define VTA_MUL_ARG_BIT_WIDTH 8 /*! Mem ID constant: uop memory */ #define VTA_MEM_ID_UOP 0 @@ -164,186 +177,6 @@ extern "C" { /*! 
Mem ID constant: output store buffer */ #define VTA_MEM_ID_OUT 4 -// Instruction organization layout: -// -// LOAD/STORE -// _____________________________|_type______________| -// arg 0: opcode | opcode_T | -// arg 1: pop_prev_dependence | bool | -// arg 2: pop_next_dependence | bool | -// arg 3: push_prev_dependence | bool | -// arg 4: push_next_dependence | bool | -// arg 5: memory_type | memop_id_T | -// arg 6: pad_value | memop_pad_val_T | -// arg 7: sram_base | memop_sram_T | -// arg 8: dram_base | memop_dram_T | -// arg 9: y_size | memop_size_T | -// arg a: x_size | memop_size_T | -// arg b: x_stride | memop_stride_T | -// arg c: y_pad_0 | memop_pad_T | -// arg d: y_pad_1 | memop_pad_T | -// arg e: x_pad_0 | memop_pad_T | -// arg f: x_pad_1 | memop_pad_T | -// -// GEMM -// _____________________________|_type______________| -// arg 0: opcode | opcode_T | -// arg 1: pop_prev_dependence | bool | -// arg 2: pop_next_dependence | bool | -// arg 3: push_prev_dependence | bool | -// arg 4: push_next_dependence | bool | -// arg 5: reset_reg | bool | -// arg 6: uop_bgn | uop_idx_T | -// arg 7: uop_end | uop_idx_T | -// arg 8: iteration count ax0 | loop_T | -// arg 9: iteration count ax1 | loop_T | -// arg a: accum idx factor ax0 | acc_idx_T | -// arg b: accum idx factor ax1 | acc_idx_T | -// arg c: input idx factor ax0 | inp_idx_T | -// arg d: input idx factor ax1 | inp_idx_T | -// arg e: weight idx factor ax0 | wgt_idx_T | -// arg f: weight idx factor ax1 | wgt_idx_T | -// -// ALU -// _____________________________|_type______________| -// arg 0: opcode | opcode_T | -// arg 1: pop_prev_dependence | bool | -// arg 2: pop_next_dependence | bool | -// arg 3: push_prev_dependence | bool | -// arg 4: push_next_dependence | bool | -// arg 5: reset_reg | bool | -// arg 6: uop_bgn | uop_idx_T | -// arg 7: uop_end | uop_idx_T | -// arg 8: iteration count ax0 | loop_T | -// arg 9: iteration count ax1 | loop_T | -// arg a: dst idx factor ax0 | acc_idx_T | -// arg b: dst idx factor ax1 | acc_idx_T | -// arg c: src idx factor ax0 | inp_idx_T | -// arg d: src idx factor ax1 | inp_idx_T | -// arg e: alu_opcode | aluop_opcode_T | -// arg f: use_imm | bool | -// arg g: imm | alu_imm_T | - -/*! Load/Store instruction start position of the opcode field */ -#define VTA_INSN_MEM_0_0 0 -/*! Load/Store instruction end position of the opcode field */ -#define VTA_INSN_MEM_0_1 (VTA_INSN_MEM_0_0 + VTA_OPCODE_BIT_WIDTH - 1) -/*! Load/Store instruction position of the pop_prev_dep field */ -#define VTA_INSN_MEM_1 (VTA_INSN_MEM_0_1 + 1) -/*! Load/Store instruction position of the pop_next_dep field */ -#define VTA_INSN_MEM_2 (VTA_INSN_MEM_1 + 1) -/*! Load/Store instruction position of the push_prev_dependence field */ -#define VTA_INSN_MEM_3 (VTA_INSN_MEM_2 + 1) -/*! Load/Store instruction position of the push_next_dependence field */ -#define VTA_INSN_MEM_4 (VTA_INSN_MEM_3 + 1) -/*! Load/Store instruction start position of the memory_type field */ -#define VTA_INSN_MEM_5_0 (VTA_INSN_MEM_4 + 1) -/*! Load/Store instruction end position of the memory_type field */ -#define VTA_INSN_MEM_5_1 (VTA_INSN_MEM_5_0 + VTA_MEMOP_ID_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the sram_base field */ -#define VTA_INSN_MEM_6_0 (VTA_INSN_MEM_5_1 + 1) -/*! Load/Store instruction end position of the sram_base field */ -#define VTA_INSN_MEM_6_1 (VTA_INSN_MEM_6_0 + VTA_MEMOP_SRAM_ADDR_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the dram_base field */ -#define VTA_INSN_MEM_7_0 (VTA_INSN_MEM_6_1 + 1) -/*! 
Load/Store instruction end position of the dram_base field */ -#define VTA_INSN_MEM_7_1 (VTA_INSN_MEM_7_0 + VTA_MEMOP_DRAM_ADDR_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the y_size field */ -#define VTA_INSN_MEM_8_0 64 -/*! Load/Store instruction end position of the y_size field */ -#define VTA_INSN_MEM_8_1 (VTA_INSN_MEM_8_0 + VTA_MEMOP_SIZE_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the x_size field */ -#define VTA_INSN_MEM_9_0 (VTA_INSN_MEM_8_1 + 1) -/*! Load/Store instruction start position of the x_size field */ -#define VTA_INSN_MEM_9_1 (VTA_INSN_MEM_9_0 + VTA_MEMOP_SIZE_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the x_stride field */ -#define VTA_INSN_MEM_A_0 (VTA_INSN_MEM_9_1 + 1) -/*! Load/Store instruction end position of the x_stride field */ -#define VTA_INSN_MEM_A_1 (VTA_INSN_MEM_A_0 + VTA_MEMOP_STRIDE_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the y_pad_0 field */ -#define VTA_INSN_MEM_B_0 (VTA_INSN_MEM_A_1 + 1) -/*! Load/Store instruction start position of the y_pad_0 field */ -#define VTA_INSN_MEM_B_1 (VTA_INSN_MEM_B_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the y_pad_1 field */ -#define VTA_INSN_MEM_C_0 (VTA_INSN_MEM_B_1 + 1) -/*! Load/Store instruction start position of the y_pad_1 field */ -#define VTA_INSN_MEM_C_1 (VTA_INSN_MEM_C_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the x_pad_0 field */ -#define VTA_INSN_MEM_D_0 (VTA_INSN_MEM_C_1 + 1) -/*! Load/Store instruction start position of the x_pad_0 field */ -#define VTA_INSN_MEM_D_1 (VTA_INSN_MEM_D_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1) -/*! Load/Store instruction start position of the x_pad_1 field */ -#define VTA_INSN_MEM_E_0 (VTA_INSN_MEM_D_1 + 1) -/*! Load/Store instruction start position of the x_pad_1 field */ -#define VTA_INSN_MEM_E_1 (VTA_INSN_MEM_E_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1) - -/*! GEMM instruction start position of the opcode field */ -#define VTA_INSN_GEM_0_0 0 -/*! GEMM instruction end position of the opcode field */ -#define VTA_INSN_GEM_0_1 (VTA_INSN_GEM_0_0 + VTA_OPCODE_BIT_WIDTH - 1) -/*! GEMM instruction position of the pop_prev_dep field */ -#define VTA_INSN_GEM_1 (VTA_INSN_GEM_0_1 + 1) -/*! GEMM instruction position of the pop_next_dep field */ -#define VTA_INSN_GEM_2 (VTA_INSN_GEM_1 + 1) -/*! GEMM instruction position of the push_prev_dependence field */ -#define VTA_INSN_GEM_3 (VTA_INSN_GEM_2 + 1) -/*! GEMM instruction position of the push_next_dependence field */ -#define VTA_INSN_GEM_4 (VTA_INSN_GEM_3 + 1) -/*! GEMM instruction position of the reset register bit */ -#define VTA_INSN_GEM_5 (VTA_INSN_GEM_4 + 1) -/*! GEMM instruction start position of the uop_bgn field */ -#define VTA_INSN_GEM_6_0 (VTA_INSN_GEM_5 + 1) -/*! GEMM instruction end position of the uop_bgn field */ -#define VTA_INSN_GEM_6_1 (VTA_INSN_GEM_6_0 + VTA_LOG_UOP_BUFF_DEPTH - 1) -/*! GEMM instruction start position of the uop_end field */ -#define VTA_INSN_GEM_7_0 (VTA_INSN_GEM_6_1 + 1) -/*! GEMM instruction end position of the uop_end field */ -#define VTA_INSN_GEM_7_1 (VTA_INSN_GEM_7_0 + VTA_LOG_UOP_BUFF_DEPTH + 1 - 1) -/*! GEMM instruction start position of the iter_out field */ -#define VTA_INSN_GEM_8_0 (VTA_INSN_GEM_7_1 + 1) -/*! GEMM instruction end position of the iter_out field */ -#define VTA_INSN_GEM_8_1 (VTA_INSN_GEM_8_0 + VTA_LOOP_ITER_WIDTH - 1) -/*! GEMM instruction start position of the iter_in field */ -#define VTA_INSN_GEM_9_0 (VTA_INSN_GEM_8_1 + 1) -/*! 
GEMM instruction end position of the iter_in field */ -#define VTA_INSN_GEM_9_1 (VTA_INSN_GEM_9_0 + VTA_LOOP_ITER_WIDTH - 1) -/*! GEMM instruction start position of the dst_factor_out field */ -#define VTA_INSN_GEM_A_0 64 -/*! GEMM instruction end position of the dst_factor_out field */ -#define VTA_INSN_GEM_A_1 (VTA_INSN_GEM_A_0 + VTA_LOG_ACC_BUFF_DEPTH - 1) -/*! GEMM instruction start position of the dst_factor_in field */ -#define VTA_INSN_GEM_B_0 (VTA_INSN_GEM_A_1 + 1) -/*! GEMM instruction end position of the dst_factor_in field */ -#define VTA_INSN_GEM_B_1 (VTA_INSN_GEM_B_0 + VTA_LOG_ACC_BUFF_DEPTH - 1) -/*! GEMM instruction start position of the src_factor_out field */ -#define VTA_INSN_GEM_C_0 (VTA_INSN_GEM_B_1 + 1) -/*! GEMM instruction end position of the src_factor_out field */ -#define VTA_INSN_GEM_C_1 (VTA_INSN_GEM_C_0 + VTA_LOG_INP_BUFF_DEPTH - 1) -/*! GEMM instruction start position of the src_factor_in field */ -#define VTA_INSN_GEM_D_0 (VTA_INSN_GEM_C_1 + 1) -/*! GEMM instruction end position of the src_factor_in field */ -#define VTA_INSN_GEM_D_1 (VTA_INSN_GEM_D_0 + VTA_LOG_INP_BUFF_DEPTH - 1) - -/*! GEMM instruction start position of the wgt_factor_out field */ -#define VTA_INSN_GEM_E_0 (VTA_INSN_GEM_D_1 + 1) -/*! GEMM instruction end position of the wgt_factor_out field */ -#define VTA_INSN_GEM_E_1 (VTA_INSN_GEM_E_0 + VTA_LOG_WGT_BUFF_DEPTH - 1) -/*! GEMM instruction start position of the wgt_factor_in field */ -#define VTA_INSN_GEM_F_0 (VTA_INSN_GEM_E_1 + 1) -/*! GEMM instruction end position of the wgt_factor_in field */ -#define VTA_INSN_GEM_F_1 (VTA_INSN_GEM_F_0 + VTA_LOG_WGT_BUFF_DEPTH - 1) - -/*! ALU instruction start position of the alu_opcode field */ -#define VTA_INSN_ALU_E_0 (VTA_INSN_GEM_D_1 + 1) -/*! ALU instruction end position of the alu_opcode field */ -#define VTA_INSN_ALU_E_1 (VTA_INSN_ALU_E_0 + VTA_ALU_OPCODE_BIT_WIDTH - 1) -/*! ALU instruction position of the use_imm field */ -#define VTA_INSN_ALU_F (VTA_INSN_ALU_E_1 + 1) -/*! ALU instruction start position of the immediate field */ -#define VTA_INSN_ALU_G_0 (VTA_INSN_ALU_F + 1) -/*! ALU instruction end position of the immediate field */ -#define VTA_INSN_ALU_G_1 (VTA_INSN_ALU_G_0 + VTA_ALUOP_IMM_BIT_WIDTH - 1) - /*! GEMM Micro-op start position of the acc_idx field */ #define VTA_UOP_GEM_0_0 0 /*! GEMM Micro-op end position of the acc_idx field */ @@ -368,8 +201,20 @@ extern "C" { /*! \brief VTA generic instruction */ typedef struct { - uint64_t word_0 : 64; - uint64_t word_1 : 64; + /*! \brief The instruction opcode */ + uint64_t opcode : VTA_OPCODE_BIT_WIDTH; + /*! \brief Unused in this instruction */ + uint64_t pop_prev_dep : 1; + /*! \brief Pop dependence token from GEMM stage */ + uint64_t pop_next_dep : 1; + /*! \brief Unused in this instruction */ + uint64_t push_prev_dep : 1; + /*! \brief Push dependence token to GEMM stage */ + uint64_t push_next_dep : 1; + /*! \brief Padding */ + uint64_t pad_0 : 64 - VTA_OPCODE_BIT_WIDTH - 4; + /*! \brief Padding */ + uint64_t pad_1 : 64; } VTAGenericInsn; /*! 
\brief VTA load/store instruction
diff --git a/vta/python/vta/bitstream.py b/vta/python/vta/bitstream.py
index 7c5ee5523e38..b3d7df49328e 100644
--- a/vta/python/vta/bitstream.py
+++ b/vta/python/vta/bitstream.py
@@ -45,10 +45,11 @@ def get_bitstream_path():
     # Derive destination path
     cache_dir = os.getenv("VTA_CACHE_PATH", os.path.join(os.getenv("HOME"), ".vta_cache/"))
     cache_dir = os.path.join(cache_dir, env.TARGET)
+    cache_dir = os.path.join(cache_dir, env.HW_VER.replace('.', '_'))
     # Create the directory if it didn't exist
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir)
-    bit_path = os.path.join(cache_dir, env.BITSTREAM)
+    bit_path = os.path.join(cache_dir, env.BITSTREAM) + ".bit"
     return bit_path
 
@@ -63,7 +64,7 @@ def download_bitstream():
     bit = get_bitstream_path()
     url = os.path.join(BITSTREAM_URL, env.TARGET)
     url = os.path.join(url, env.HW_VER)
-    url = os.path.join(url, env.BITSTREAM)
+    url = os.path.join(url, env.BITSTREAM + ".bit")
 
     try:
         download(url, bit)
diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py
index 093b0ec5c386..ee2428be828b 100644
--- a/vta/python/vta/environment.py
+++ b/vta/python/vta/environment.py
@@ -113,15 +113,9 @@ class Environment(object):
 
     # initialization function
     def __init__(self, cfg):
-        self.__dict__.update(cfg)
-        for key in PkgConfig.cfg_keys:
-            if key not in cfg:
-                raise ValueError("Expect key %s in cfg" % key)
-        # derive output buffer size
-        self.LOG_OUT_BUFF_SIZE = (
-            self.LOG_ACC_BUFF_SIZE +
-            self.LOG_OUT_WIDTH -
-            self.LOG_ACC_WIDTH)
+        # Produce the derived parameters and update dict
+        self.pkg = self.pkg_config(cfg)
+        self.__dict__.update(self.pkg.cfg_dict)
         # data type width
         self.INP_WIDTH = 1 << self.LOG_INP_WIDTH
         self.WGT_WIDTH = 1 << self.LOG_WGT_WIDTH
@@ -154,25 +148,15 @@ def __init__(self, cfg):
         self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8
         self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8
         self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8
-        # Configuration bitstream name
-        self.BITSTREAM = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}.bit".format(
-            (1 << cfg["LOG_BATCH"]),
-            (1 << cfg["LOG_BLOCK_IN"]),
-            (1 << cfg["LOG_BLOCK_OUT"]),
-            (1 << cfg["LOG_INP_WIDTH"]),
-            (1 << cfg["LOG_WGT_WIDTH"]),
-            cfg["LOG_UOP_BUFF_SIZE"],
-            cfg["LOG_INP_BUFF_SIZE"],
-            cfg["LOG_WGT_BUFF_SIZE"],
-            cfg["LOG_ACC_BUFF_SIZE"],
-            cfg["HW_FREQ"],
-            cfg["HW_CLK_TARGET"],
-            cfg["HW_VER"].replace('.', '_'))
         # dtypes
         self.acc_dtype = "int%d" % self.ACC_WIDTH
         self.inp_dtype = "int%d" % self.INP_WIDTH
         self.wgt_dtype = "int%d" % self.WGT_WIDTH
         self.out_dtype = "int%d" % self.OUT_WIDTH
+        # bitstream name
+        self.BITSTREAM = self.pkg.bitstream
+        # model string
+        self.MODEL = self.TARGET + "_" + self.BITSTREAM
         # lazy cached members
         self.mock_mode = False
         self._mock_env = None
@@ -187,11 +171,15 @@ def __enter__(self):
 
     def __exit__(self, ptype, value, trace):
         Environment.current = self._last_env
 
-    def pkg_config(self):
+    def pkg_config(self, cfg):
        """PkgConfig instance"""
        curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
        proj_root = os.path.abspath(os.path.join(curr_path, "../../"))
-        return PkgConfig(self.__dict__, proj_root)
+        return PkgConfig(cfg, proj_root)
+
+    @property
+    def cfg_dict(self):
+        return self.pkg.cfg_dict
 
     @property
     def dev(self):
@@ -236,13 +224,15 @@ def gemm(self):
 
     @property
     def target(self):
-        return tvm.target.vta(model=self.TARGET)
+        return tvm.target.vta(model=self.MODEL)
 
     @property
     def target_host(self):
        """The target host"""
        if self.TARGET == "pynq":
            return "llvm -target=armv7-none-linux-gnueabihf"
+        if
self.TARGET == "ultra96": + return "llvm -target=aarch64-linux-gnu" if self.TARGET == "sim" or self.TARGET == "tsim": return "llvm" raise ValueError("Unknown target %s" % self.TARGET) @@ -316,21 +306,18 @@ def coproc_dep_pop(op): def _init_env(): - """Iniitalize the default global env""" + """Initialize the default global env""" curr_path = os.path.dirname( os.path.abspath(os.path.expanduser(__file__))) proj_root = os.path.abspath(os.path.join(curr_path, "../../../")) path_list = [ - os.path.join(curr_path, "vta_config.json"), - os.path.join(proj_root, "build", "vta_config.json"), - os.path.join(proj_root, "vta_config.json"), os.path.join(proj_root, "vta/config/vta_config.json") ] path_list = [p for p in path_list if os.path.exists(p)] if not path_list: raise RuntimeError( - "Error: {} not found.make sure you have config.json in your vta root" - .format(filename)) - return Environment(json.load(open(path_list[0]))) + "Error: vta_config.json not found.") + cfg = json.load(open(path_list[0])) + return Environment(cfg) Environment.current = _init_env() diff --git a/vta/python/vta/pkg_config.py b/vta/python/vta/pkg_config.py index 3977d5aa2e43..5390ee0de9a8 100644 --- a/vta/python/vta/pkg_config.py +++ b/vta/python/vta/pkg_config.py @@ -38,49 +38,209 @@ class PkgConfig(object): """ cfg_keys = [ "TARGET", - "HW_FREQ", - "HW_CLK_TARGET", - "HW_VER", "LOG_INP_WIDTH", "LOG_WGT_WIDTH", "LOG_ACC_WIDTH", - "LOG_OUT_WIDTH", "LOG_BATCH", - "LOG_BLOCK_IN", - "LOG_BLOCK_OUT", + "LOG_BLOCK", "LOG_UOP_BUFF_SIZE", "LOG_INP_BUFF_SIZE", "LOG_WGT_BUFF_SIZE", "LOG_ACC_BUFF_SIZE", ] + def __init__(self, cfg, proj_root): - # include path + + # Derived parameters + cfg["LOG_BLOCK_IN"] = cfg["LOG_BLOCK"] + cfg["LOG_BLOCK_OUT"] = cfg["LOG_BLOCK"] + cfg["LOG_OUT_WIDTH"] = cfg["LOG_INP_WIDTH"] + cfg["LOG_OUT_BUFF_SIZE"] = ( + cfg["LOG_ACC_BUFF_SIZE"] + + cfg["LOG_OUT_WIDTH"] - + cfg["LOG_ACC_WIDTH"]) + + # Update cfg now that we've extended it + self.__dict__.update(cfg) + + # Include path self.include_path = [ "-I%s/include" % proj_root, "-I%s/vta/include" % proj_root, "-I%s/3rdparty/dlpack/include" % proj_root, "-I%s/3rdparty/dmlc-core/include" % proj_root ] + # List of source files that can be used to build standalone library. self.lib_source = [] self.lib_source += glob.glob("%s/vta/src/*.cc" % proj_root) - self.lib_source += glob.glob("%s/vta/src/%s/*.cc" % (proj_root, cfg["TARGET"])) - # macro keys - self.macro_defs = [] - self.cfg_dict = {} - for key in self.cfg_keys: - self.macro_defs.append("-DVTA_%s=%s" % (key, str(cfg[key]))) - self.cfg_dict[key] = cfg[key] + if self.TARGET in ["pynq", "ultra96"]: + # add pynq drivers for any board that uses pynq driver stack (see pynq.io) + self.lib_source += glob.glob("%s/vta/src/pynq/*.cc" % (proj_root)) - self.target = cfg["TARGET"] - - if self.target == "pynq": + # Linker flags + if self.TARGET in ["pynq", "ultra96"]: self.ldflags = [ "-L/usr/lib", "-l:libcma.so"] else: self.ldflags = [] + # Derive bitstream config string. 
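The string built below bakes the GEMM shape, data widths, and buffer sizes into the bitstream name, so a cached bitstream cannot silently be paired with a mismatched runtime. As a minimal sketch of what it evaluates to, assuming the stock values shipped in vta/config/vta_config.json (adjust the dict if your configuration differs; these constants are the only assumption here):

cfg = {"LOG_BATCH": 0, "LOG_BLOCK": 4,
       "LOG_INP_WIDTH": 3, "LOG_WGT_WIDTH": 3, "LOG_ACC_WIDTH": 5,
       "LOG_UOP_BUFF_SIZE": 15, "LOG_INP_BUFF_SIZE": 15,
       "LOG_WGT_BUFF_SIZE": 18, "LOG_ACC_BUFF_SIZE": 17}
bitstream = "{}x{}_i{}w{}a{}_{}_{}_{}_{}".format(
    1 << cfg["LOG_BATCH"], 1 << cfg["LOG_BLOCK"],
    1 << cfg["LOG_INP_WIDTH"], 1 << cfg["LOG_WGT_WIDTH"],
    1 << cfg["LOG_ACC_WIDTH"],
    cfg["LOG_UOP_BUFF_SIZE"], cfg["LOG_INP_BUFF_SIZE"],
    cfg["LOG_WGT_BUFF_SIZE"], cfg["LOG_ACC_BUFF_SIZE"])
print(bitstream)  # -> 1x16_i8w8a32_15_15_18_17

The patch derives it as follows: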
+ self.bitstream = "{}x{}_i{}w{}a{}_{}_{}_{}_{}".format( + (1 << cfg["LOG_BATCH"]), + (1 << cfg["LOG_BLOCK"]), + (1 << cfg["LOG_INP_WIDTH"]), + (1 << cfg["LOG_WGT_WIDTH"]), + (1 << cfg["LOG_ACC_WIDTH"]), + cfg["LOG_UOP_BUFF_SIZE"], + cfg["LOG_INP_BUFF_SIZE"], + cfg["LOG_WGT_BUFF_SIZE"], + cfg["LOG_ACC_BUFF_SIZE"]) + + # Derive FPGA parameters from target + # - device: part number + # - family: fpga family + # - freq: PLL frequency + # - per: clock period to achieve in HLS + # (how aggressively design is pipelined) + # - axi_bus_width: axi bus width used for DMA transactions + # (property of FPGA memory interface) + # - axi_cache_bits: ARCACHE/AWCACHE signals for the AXI bus + # (e.g. 1111 is write-back read and write allocate) + # - axi_prot_bits: ARPROT/AWPROT signals for the AXI bus + if self.TARGET == "ultra96": + self.fpga_device = "xczu3eg-sbva484-1-e" + self.fpga_family = "zynq-ultrascale+" + self.fpga_freq = 333 + self.fpga_per = 2 + self.fpga_log_axi_bus_width = 7 + self.axi_prot_bits = '010' + # IP register address map + self.ip_reg_map_range = "0x1000" + self.fetch_base_addr = "0xA0000000" + self.load_base_addr = "0xA0001000" + self.compute_base_addr = "0xA0002000" + self.store_base_addr = "0xA0003000" + else: + # By default, we use the pynq parameters + self.fpga_device = "xc7z020clg484-1" + self.fpga_family = "zynq-7000" + self.fpga_freq = 100 + self.fpga_per = 7 + self.fpga_log_axi_bus_width = 6 + self.axi_prot_bits = '000' + # IP register address map + self.ip_reg_map_range = "0x1000" + self.fetch_base_addr = "0x43C00000" + self.load_base_addr = "0x43C01000" + self.compute_base_addr = "0x43C02000" + self.store_base_addr = "0x43C03000" + # Set coherence settings + coherent = True + if coherent: + self.axi_cache_bits = '1111' + self.coherent = True + + # Define IP memory mapped registers offsets. + # In HLS 0x00-0x0C is reserved for block-level I/O protocol. + # Make sure to leave 8B between register offsets to maintain + # compatibility with 64bit systems. + self.fetch_insn_count_offset = 0x10 + self.fetch_insn_addr_offset = self.fetch_insn_count_offset + 0x08 + self.load_inp_addr_offset = 0x10 + self.load_wgt_addr_offset = self.load_inp_addr_offset + 0x08 + self.compute_done_wr_offet = 0x10 + self.compute_done_rd_offet = self.compute_done_wr_offet + 0x08 + self.compute_uop_addr_offset = self.compute_done_rd_offet + 0x08 + self.compute_bias_addr_offset = self.compute_uop_addr_offset + 0x08 + self.store_out_addr_offset = 0x10 + + # Derive SRAM parameters + # The goal here is to determine how many memory banks are needed, + # how deep and wide each bank needs to be. This is derived from + # the size of each memory element (result of data width, and tensor shape), + # and also how wide a memory can be as permitted by the FPGA tools. + # + # The mem axi ratio is a parameter used by HLS to resize memories + # so memory read/write ports are the same size as the design axi bus width. 
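To make the derivation that follows concrete, here is a sketch with worked numbers for the input buffer, assuming the pynq defaults above (LOG_INP_WIDTH=3, LOG_BATCH=0, LOG_BLOCK_IN=4, LOG_INP_BUFF_SIZE=15, and a 64-bit AXI bus); the formulas mirror the code below, only the constants are assumed:

max_bus_width = 1024                   # vendor limit on a BRAM port width
mem_bus_width = 1 << 6                 # 64-bit AXI data bus
inp_mem_bus_width = 1 << (3 + 0 + 4)   # one input tensor = 128 bits
inp_mem_size = 1 << 15                 # 32 kB input buffer
inp_mem_banks = (inp_mem_bus_width + max_bus_width - 1) // max_bus_width  # = 1
inp_mem_width = min(inp_mem_bus_width, max_bus_width)                     # = 128
inp_mem_depth = inp_mem_size * 8 // inp_mem_bus_width                     # = 2048
inp_mem_axi_ratio = inp_mem_width // mem_bus_width                        # = 2

The patch's derivation reads: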
+ # + # Max bus width allowed (property of FPGA vendor toolchain) + max_bus_width = 1024 + # Bus width of a memory interface + mem_bus_width = 1 << self.fpga_log_axi_bus_width + # Input memory + inp_mem_bus_width = 1 << (cfg["LOG_INP_WIDTH"] + \ + cfg["LOG_BATCH"] + \ + cfg["LOG_BLOCK_IN"]) + self.inp_mem_size = 1 << cfg["LOG_INP_BUFF_SIZE"] # bytes + self.inp_mem_banks = (inp_mem_bus_width + \ + max_bus_width - 1) // \ + max_bus_width + self.inp_mem_width = min(inp_mem_bus_width, max_bus_width) + self.inp_mem_depth = self.inp_mem_size * 8 // inp_mem_bus_width + self.inp_mem_axi_ratio = self.inp_mem_width // mem_bus_width + # Weight memory + wgt_mem_bus_width = 1 << (cfg["LOG_WGT_WIDTH"] + \ + cfg["LOG_BLOCK_IN"] + \ + cfg["LOG_BLOCK_OUT"]) + self.wgt_mem_size = 1 << cfg["LOG_WGT_BUFF_SIZE"] # bytes + self.wgt_mem_banks = (wgt_mem_bus_width + \ + max_bus_width - 1) // \ + max_bus_width + self.wgt_mem_width = min(wgt_mem_bus_width, max_bus_width) + self.wgt_mem_depth = self.wgt_mem_size * 8 // wgt_mem_bus_width + self.wgt_mem_axi_ratio = self.wgt_mem_width // mem_bus_width + # Output memory + out_mem_bus_width = 1 << (cfg["LOG_OUT_WIDTH"] + \ + cfg["LOG_BATCH"] + \ + cfg["LOG_BLOCK_OUT"]) + self.out_mem_size = 1 << cfg["LOG_OUT_BUFF_SIZE"] # bytes + self.out_mem_banks = (out_mem_bus_width + \ + max_bus_width - 1) // \ + max_bus_width + self.out_mem_width = min(out_mem_bus_width, max_bus_width) + self.out_mem_depth = self.out_mem_size * 8 // out_mem_bus_width + self.out_mem_axi_ratio = self.out_mem_width // mem_bus_width + + # Macro defs + self.macro_defs = [] + self.cfg_dict = {} + for key in cfg: + self.macro_defs.append("-DVTA_%s=%s" % (key, str(cfg[key]))) + self.cfg_dict[key] = cfg[key] + self.macro_defs.append("-DVTA_LOG_BUS_WIDTH=%s" % (self.fpga_log_axi_bus_width)) + # Macros used by the VTA driver + self.macro_defs.append("-DVTA_IP_REG_MAP_RANGE=%s" % (self.ip_reg_map_range)) + self.macro_defs.append("-DVTA_FETCH_ADDR=%s" % (self.fetch_base_addr)) + self.macro_defs.append("-DVTA_LOAD_ADDR=%s" % (self.load_base_addr)) + self.macro_defs.append("-DVTA_COMPUTE_ADDR=%s" % (self.compute_base_addr)) + self.macro_defs.append("-DVTA_STORE_ADDR=%s" % (self.store_base_addr)) + # IP register offsets + self.macro_defs.append("-DVTA_FETCH_INSN_COUNT_OFFSET=%s" % \ + (self.fetch_insn_count_offset)) + self.macro_defs.append("-DVTA_FETCH_INSN_ADDR_OFFSET=%s" % \ + (self.fetch_insn_addr_offset)) + self.macro_defs.append("-DVTA_LOAD_INP_ADDR_OFFSET=%s" % \ + (self.load_inp_addr_offset)) + self.macro_defs.append("-DVTA_LOAD_WGT_ADDR_OFFSET=%s" % \ + (self.load_wgt_addr_offset)) + self.macro_defs.append("-DVTA_COMPUTE_DONE_WR_OFFSET=%s" % \ + (self.compute_done_wr_offet)) + self.macro_defs.append("-DVTA_COMPUTE_DONE_RD_OFFSET=%s" % \ + (self.compute_done_rd_offet)) + self.macro_defs.append("-DVTA_COMPUTE_UOP_ADDR_OFFSET=%s" % \ + (self.compute_uop_addr_offset)) + self.macro_defs.append("-DVTA_COMPUTE_BIAS_ADDR_OFFSET=%s" % \ + (self.compute_bias_addr_offset)) + self.macro_defs.append("-DVTA_STORE_OUT_ADDR_OFFSET=%s" % \ + (self.store_out_addr_offset)) + # Coherency + if coherent: + self.macro_defs.append("-DVTA_COHERENT_ACCESSES=true") + else: + self.macro_defs.append("-DVTA_COHERENT_ACCESSES=false") + @property def cflags(self): return self.include_path + self.macro_defs diff --git a/vta/python/vta/program_bitstream.py b/vta/python/vta/program_bitstream.py index 5c5a86293885..e735a4cf252c 100644 --- a/vta/python/vta/program_bitstream.py +++ b/vta/python/vta/program_bitstream.py @@ -48,9 +48,12 @@ def 
pynq_bitstream_program(bitstream_path): bitstream.download() def bitstream_program(target, bitstream): - if target == 'pynq': + if target in ['pynq', 'ultra96']: pynq_bitstream_program(bitstream) - elif target != 'sim': + elif target in ['sim', 'tsim']: + # In simulation, bit stream programming is a no-op + return + else: raise RuntimeError("Unknown target {}".format(target)) if __name__ == "__main__": diff --git a/vta/python/vta/rpc_client.py b/vta/python/vta/rpc_client.py index a5bafab498a5..f689ef46ba1c 100644 --- a/vta/python/vta/rpc_client.py +++ b/vta/python/vta/rpc_client.py @@ -30,7 +30,7 @@ def reconfig_runtime(remote): """ env = get_env() freconfig = remote.get_function("tvm.contrib.vta.reconfig_runtime") - freconfig(env.pkg_config().cfg_json) + freconfig(env.pkg.cfg_json) def program_fpga(remote, bitstream=None): diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index b748cdf23358..67fc6b275b79 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -33,7 +33,6 @@ def run(run_func): env = get_env() if env.TARGET in ["sim", "tsim"]: - # Talk to local RPC if necessary to debug RPC server. # Compile vta on your host with make at the root. # Make sure TARGET is set to "sim" in the config.json file. @@ -53,21 +52,20 @@ def run(run_func): assert simulator.enabled() run_func(env, rpc.LocalSession()) - elif env.TARGET == "pynq": - + elif env.TARGET in ["pynq", "ultra96"]: # The environment variables below should be set if we are using # a tracker to obtain a remote for a test device - tracket_host = os.environ.get("TVM_TRACKER_HOST", None) - tracket_port = os.environ.get("TVM_TRACKER_PORT", None) + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = os.environ.get("TVM_TRACKER_PORT", None) # Otherwise, we can set the variables below to directly # obtain a remote from a test device pynq_host = os.environ.get("VTA_PYNQ_RPC_HOST", None) pynq_port = os.environ.get("VTA_PYNQ_RPC_PORT", None) # Run device from fleet node if env variables are defined - if tracket_host and tracket_port: + if tracker_host and tracker_port: remote = autotvm.measure.request_remote(env.TARGET, - tracket_host, - int(tracket_port), + tracker_host, + int(tracker_port), timeout=10000) run_func(env, remote) else: @@ -78,3 +76,6 @@ def run(run_func): else: raise RuntimeError( "Please set the VTA_PYNQ_RPC_HOST and VTA_PYNQ_RPC_PORT environment variables") + + else: + raise RuntimeError("Unknown target %s" % env.TARGET) diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc index 47ca604d9d39..a37bb4e466af 100644 --- a/vta/src/pynq/pynq_driver.cc +++ b/vta/src/pynq/pynq_driver.cc @@ -15,12 +15,9 @@ * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. - */ - -/*! - * Copyright (c) 2018 by Contributors + * * \file pynq_driver.c - * \brief VTA driver for Pynq board. + * \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io). */ #include @@ -53,19 +50,19 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) { memcpy(dst, src, size); } -void VTAFlushCache(vta_phy_addr_t buf, int size) { - // Call the xlnkFlushCache on the CMA buffer +void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { + // Call the cma_flush_cache on the CMA buffer // so that the FPGA can read the buffer data. 
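The two-address signature exists because the pynq CMA calls want the virtual address for cache maintenance and the physical address for the DMA view. A hedged sketch of the intended host-side call pattern follows (names illustrative; it mirrors the DataBuffer::FlushCache/InvalidateCache changes in runtime.cc further below):

/* Sketch only, not part of the patch:
 *   void* vbuf = VTAMemAlloc(size, 1);             // cached CMA buffer
 *   vta_phy_addr_t pbuf = VTAMemGetPhyAddr(vbuf);  // physical view for the FPGA
 *   // ... CPU fills vbuf ...
 *   VTAFlushCache(vbuf, pbuf, size);               // write back so the FPGA reads fresh data
 *   // ... FPGA writes results ...
 *   VTAInvalidateCache(vbuf, pbuf, size);          // drop stale lines before the CPU reads
 */

The body swap itself: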
- xlnkFlushCache(reinterpret_cast<void*>(buf), size);
+  cma_flush_cache(vir_addr, phy_addr, size);
 }
 
-void VTAInvalidateCache(vta_phy_addr_t buf, int size) {
-  // Call the xlnkInvalidateCache on the CMA buffer
+void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
+  // Call the cma_invalidate_cache on the CMA buffer
   // so that the host needs to read the buffer data.
-  xlnkInvalidateCache(reinterpret_cast<void*>(buf), size);
+  cma_invalidate_cache(vir_addr, phy_addr, size);
 }
 
-void *VTAMapRegister(uint32_t addr, size_t length) {
+void *VTAMapRegister(uint32_t addr) {
   // Align the base address with the pages
   uint32_t virt_base = addr & ~(getpagesize() - 1);
   // Calculate base address offset w.r.t the base address
@@ -73,16 +70,16 @@
   // Open file and mmap
   uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC);
   return mmap(NULL,
-              (length+virt_offset),
+              (VTA_IP_REG_MAP_RANGE + virt_offset),
               PROT_READ|PROT_WRITE,
               MAP_SHARED,
               mmap_file,
               virt_base);
 }
 
-void VTAUnmapRegister(void *vta, size_t length) {
+void VTAUnmapRegister(void *vta) {
   // Unmap memory
-  int status = munmap(vta, length);
+  int status = munmap(vta, VTA_IP_REG_MAP_RANGE);
   assert(status == 0);
 }
 
@@ -98,39 +95,30 @@ class VTADevice {
  public:
   VTADevice() {
     // VTA stage handles
-    vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
-    vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
-    vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
-    vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
+    vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR);
+    vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR);
+    vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR);
+    vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR);
   }
   ~VTADevice() {
     // Close VTA stage handle
-    VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE);
-    VTAUnmapRegister(vta_load_handle_, VTA_RANGE);
-    VTAUnmapRegister(vta_compute_handle_, VTA_RANGE);
-    VTAUnmapRegister(vta_store_handle_, VTA_RANGE);
+    VTAUnmapRegister(vta_fetch_handle_);
+    VTAUnmapRegister(vta_load_handle_);
+    VTAUnmapRegister(vta_compute_handle_);
+    VTAUnmapRegister(vta_store_handle_);
   }
 
   int Run(vta_phy_addr_t insn_phy_addr,
           uint32_t insn_count,
           uint32_t wait_cycles) {
-    // NOTE: Register address map is derived from the auto-generated
-    // driver files available under hardware/build/vivado//export/driver
-    // FETCH @ 0x10 : Data signal of insn_count_V
-    VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_count);
-    // FETCH @ 0x18 : Data signal of insns_V
-    VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_phy_addr);
-    // LOAD @ 0x10 : Data signal of inputs_V
-    VTAWriteMappedReg(vta_load_handle_, 0x10, 0);
-    // LOAD @ 0x18 : Data signal of weight_V
-    VTAWriteMappedReg(vta_load_handle_, 0x18, 0);
-    // COMPUTE @ 0x20 : Data signal of uops_V
-    VTAWriteMappedReg(vta_compute_handle_, 0x20, 0);
-    // COMPUTE @ 0x28 : Data signal of biases_V
-    VTAWriteMappedReg(vta_compute_handle_, 0x28, 0);
-    // STORE @ 0x10 : Data signal of outputs_V
-    VTAWriteMappedReg(vta_store_handle_, 0x10, 0);
+    VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_COUNT_OFFSET, insn_count);
+    VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy_addr);
+    VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_INP_ADDR_OFFSET, 0);
+    VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_WGT_ADDR_OFFSET, 0);
+    VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_UOP_ADDR_OFFSET, 0);
+    VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_BIAS_ADDR_OFFSET, 0);
+
VTAWriteMappedReg(vta_store_handle_, VTA_STORE_OUT_ADDR_OFFSET, 0); // VTA start VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START); @@ -141,7 +129,7 @@ class VTADevice { // Loop until the VTA is done unsigned t, flag = 0; for (t = 0; t < wait_cycles; ++t) { - flag = VTAReadMappedReg(vta_compute_handle_, 0x18); + flag = VTAReadMappedReg(vta_compute_handle_, VTA_COMPUTE_DONE_RD_OFFSET); if (flag == VTA_DONE) break; std::this_thread::yield(); } diff --git a/vta/src/pynq/pynq_driver.h b/vta/src/pynq/pynq_driver.h index 7cfee4cf0958..bb6ca3db2b93 100644 --- a/vta/src/pynq/pynq_driver.h +++ b/vta/src/pynq/pynq_driver.h @@ -6,21 +6,18 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. - */ - -/*! - * Copyright (c) 2018 by Contributors - * \file vta_pynq_driver.h - * \brief VTA driver for Pynq board. + * + * \file pynq_driver.h + * \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io). */ #ifndef VTA_PYNQ_PYNQ_DRIVER_H_ @@ -41,23 +38,21 @@ extern "C" { #include #include -#ifdef __arm__ +#if defined(__arm__) || defined(__aarch64__) #include #else void* cma_alloc(size_t size, int cached); void cma_free(void* buf); uint32_t cma_get_phy_addr(void* buf); +void cma_flush_cache(void* buf, unsigned int phys_addr, int size); +void cma_invalidate_cache(void* buf, unsigned int phys_addr, int size); #endif -void xlnkFlushCache(void* buf, int size); -void xlnkInvalidateCache(void* buf, int size); -void *VTAMapRegister(uint32_t addr, size_t length); -void VTAUnmapRegister(void *vta, size_t length); +void *VTAMapRegister(uint32_t addr); +void VTAUnmapRegister(void *vta); void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val); uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset); -/*! \brief VTA configuration register address range */ -#define VTA_RANGE 0x100 /*! \brief VTA configuration register start value */ #define VTA_START 0x1 /*! \brief VTA configuration register auto-restart value */ @@ -65,27 +60,6 @@ uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset); /*! \brief VTA configuration register done value */ #define VTA_DONE 0x1 -/*! \brief VTA fetch stage configuration register address -* from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_FETCH_ADDR 0x43C00000 -/*! \brief VTA compute stage configuration register address -* from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_COMPUTE_ADDR 0x43C10000 -/*! \brief VTA compute stage configuration register address -* from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_LOAD_ADDR 0x43C20000 -/*! 
\brief VTA store stage configuration register address -* from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define -* in xparameters.h (under build/vivado//export/bsp/ps7_cortexa9_0/include) -*/ -#define VTA_STORE_ADDR 0x43C30000 - #ifdef __cplusplus } #endif diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc index cebfaf7bb68f..4a6552892f4e 100644 --- a/vta/src/runtime.cc +++ b/vta/src/runtime.cc @@ -44,8 +44,10 @@ namespace vta { static_assert(VTA_UOP_WIDTH == sizeof(VTAUop) * 8, "VTA_UOP_WIDTH do not match VTAUop size"); -/*! \brief Enable coherent access between VTA and CPU (used on shared mem systems). */ -static const bool kBufferCoherent = true; +/*! \brief Enable coherent access of data buffers between VTA and CPU */ +static const bool kBufferCoherent = VTA_COHERENT_ACCESSES; +/*! \brief Always cache buffers (otherwise, write back to DRAM from CPU) */ +static const bool kAlwaysCache = true; /*! * \brief Data buffer represents data on CMA. @@ -65,8 +67,10 @@ struct DataBuffer { * \param size The size of the data. */ void InvalidateCache(size_t offset, size_t size) { - if (!kBufferCoherent) { - VTAInvalidateCache(phy_addr_ + offset, size); + if (!kBufferCoherent && kAlwaysCache) { + VTAInvalidateCache(reinterpret_cast(data_) + offset, + phy_addr_ + offset, + size); } } /*! @@ -75,8 +79,10 @@ struct DataBuffer { * \param size The size of the data. */ void FlushCache(size_t offset, size_t size) { - if (!kBufferCoherent) { - VTAFlushCache(phy_addr_ + offset, size); + if (!kBufferCoherent && kAlwaysCache) { + VTAFlushCache(reinterpret_cast(data_) + offset, + phy_addr_ + offset, + size); } } /*! @@ -102,7 +108,7 @@ struct DataBuffer { * \param size The size of the buffer. */ static DataBuffer* Alloc(size_t size) { - void* data = VTAMemAlloc(size, 1); + void* data = VTAMemAlloc(size, kAlwaysCache); CHECK(data != nullptr); DataBuffer* buffer = new DataBuffer(); buffer->data_ = data; @@ -469,7 +475,9 @@ class UopQueue : public BaseQueue { // Flush if we're using a shared memory system // and if interface is non-coherent if (!coherent_ && always_cache_) { - VTAFlushCache(fpga_buff_phy_, offset); + VTAFlushCache(fpga_buff_, + fpga_buff_phy_, + offset); } } @@ -860,7 +868,9 @@ class InsnQueue : public BaseQueue { // Flush if we're using a shared memory system // and if interface is non-coherent if (!coherent_ && always_cache_) { - VTAFlushCache(fpga_buff_phy_, buff_size); + VTAFlushCache(fpga_buff_, + fpga_buff_phy_, + buff_size); } } @@ -1302,9 +1312,9 @@ class CommandQueue { // The kernel we are currently recording UopKernel* record_kernel_{nullptr}; // Micro op queue - UopQueue uop_queue_; + UopQueue uop_queue_; // instruction queue - InsnQueue insn_queue_; + InsnQueue insn_queue_; // Device handle VTADeviceHandle device_{nullptr}; #ifdef USE_TSIM diff --git a/vta/src/sim/sim_driver.cc b/vta/src/sim/sim_driver.cc index 9d81befdaa6e..ca0fd7ec521a 100644 --- a/vta/src/sim/sim_driver.cc +++ b/vta/src/sim/sim_driver.cc @@ -615,10 +615,10 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) { memcpy(dst, src, size); } -void VTAFlushCache(vta_phy_addr_t buf, int size) { +void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { } -void VTAInvalidateCache(vta_phy_addr_t buf, int size) { +void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { } VTADeviceHandle VTADeviceAlloc() { diff --git a/vta/src/tsim/tsim_driver.cc b/vta/src/tsim/tsim_driver.cc index 799ee27e5a9a..a7bcc3c54ca8 100644 --- a/vta/src/tsim/tsim_driver.cc +++ 
b/vta/src/tsim/tsim_driver.cc @@ -228,10 +228,10 @@ void VTAMemCopyToHost(void* dst, const void* src, size_t size) { memcpy(dst, src, size); } -void VTAFlushCache(vta_phy_addr_t buf, int size) { +void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { } -void VTAInvalidateCache(vta_phy_addr_t buf, int size) { +void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { } VTADeviceHandle VTADeviceAlloc() { diff --git a/vta/tests/hardware/common/test_lib.cc b/vta/tests/hardware/common/test_lib.cc index e88cede4d055..7c47c2c3e012 100644 --- a/vta/tests/hardware/common/test_lib.cc +++ b/vta/tests/hardware/common/test_lib.cc @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2018 by Contributors * \file test_lib.cpp * \brief Test library for the VTA design simulation and driver tests. */ @@ -32,10 +31,10 @@ uint64_t vta( uint32_t insn_count, VTAGenericInsn *insns, VTAUop *uops, - inp_T *inputs, - wgt_T *weights, - acc_T *biases, - inp_T *outputs) { + uint32_t *inputs, + uint32_t *weights, + uint32_t *biases, + uint32_t *outputs) { // Performance counter variables uint64_t t_fpga; struct timespec start, stop; @@ -53,18 +52,18 @@ uint64_t vta( snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit"); // Get VTA handles - void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); - void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE); - void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE); - void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE); + void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR); + void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR); + void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR); + void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR); // Physical address pointers - uint32_t insn_phy = insns ? VTAMemGetPhyAddr(insns) : 0; - uint32_t uop_phy = uops ? VTAMemGetPhyAddr(uops) : 0; - uint32_t input_phy = inputs ? VTAMemGetPhyAddr(inputs) : 0; - uint32_t weight_phy = weights ? VTAMemGetPhyAddr(weights) : 0; - uint32_t bias_phy = biases ? VTAMemGetPhyAddr(biases) : 0; - uint32_t output_phy = outputs ? VTAMemGetPhyAddr(outputs) : 0; + uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0; + uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0; + uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0; + uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0; + uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0; + uint32_t output_phy = outputs ? 
cma_get_phy_addr(outputs) : 0; #if VTA_DEBUG == 1 printf("INFO - Starting FPGA!\n"); @@ -72,20 +71,13 @@ uint64_t vta( clock_gettime(CLOCK_REALTIME, &start); - // FETCH @ 0x10 : Data signal of insn_count_V - VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count); - // FETCH @ 0x18 : Data signal of insns_V - if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy); - // LOAD @ 0x10 : Data signal of inputs_V - if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy); - // LOAD @ 0x18 : Data signal of weight_V - if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy); - // COMPUTE @ 0x20 : Data signal of uops_V - if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy); - // COMPUTE @ 0x28 : Data signal of biases_V - if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy); - // STORE @ 0x10 : Data signal of outputs_V - if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy); + VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_COUNT_OFFSET, insn_count); + if (insns) VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy); + if (inputs) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_INP_ADDR_OFFSET, input_phy); + if (weights) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_WGT_ADDR_OFFSET, weight_phy); + if (uops) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_UOP_ADDR_OFFSET, uop_phy); + if (biases) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_BIAS_ADDR_OFFSET, bias_phy); + if (outputs) VTAWriteMappedReg(vta_store_handle, VTA_STORE_OUT_ADDR_OFFSET, output_phy); // VTA start VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1); @@ -95,7 +87,7 @@ uint64_t vta( int flag = 0, t = 0; for (t = 0; t < 10000000; ++t) { - flag = VTAReadMappedReg(vta_compute_handle, 0x18); + flag = VTAReadMappedReg(vta_compute_handle, VTA_COMPUTE_DONE_RD_OFFSET); if (flag & VTA_DONE) break; } @@ -111,10 +103,10 @@ uint64_t vta( t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec); // Unmap VTA register - VTAUnmapRegister(vta_fetch_handle, VTA_RANGE); - VTAUnmapRegister(vta_load_handle, VTA_RANGE); - VTAUnmapRegister(vta_compute_handle, VTA_RANGE); - VTAUnmapRegister(vta_store_handle, VTA_RANGE); + VTAUnmapRegister(vta_fetch_handle); + VTAUnmapRegister(vta_load_handle); + VTAUnmapRegister(vta_compute_handle); + VTAUnmapRegister(vta_store_handle); return t_fpga; } @@ -147,27 +139,30 @@ const char* getOpcodeString(int opcode, bool use_imm) { } else if (opcode == VTA_ALU_OPCODE_SHR) { return "shr"; } + // else if (opcode == VTA_ALU_OPCODE_MUL) { + // return "mul"; + // } return "unknown op"; } -template -void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block) { +template +void packBuffer(DST_T *dst, SRC_T **src, int y_size, int x_size, int y_block, int x_block) { + assert((SRC_T_WIDTH * x_block * y_block) % DST_T_WIDTH == 0); + assert(DST_T_WIDTH <= 64); int buffer_idx = 0; + int ratio = DST_T_WIDTH / SRC_T_WIDTH; + long long int mask = (1ULL << SRC_T_WIDTH) - 1; + DST_T tmp = 0; for (int i = 0; i < y_size / y_block; i++) { for (int j = 0; j < x_size / x_block; j++) { for (int k = 0; k < y_block; k++) { - if (T_WIDTH < 8) { - for (int l = 0; l < x_block; l += 8 / T_WIDTH) { - dst[buffer_idx] = 0; - for (int m = 0; m < 8 / T_WIDTH; m++) { - dst[buffer_idx] |= (src[i * y_block + k][j * x_block + l + m] & - ((1ULL << T_WIDTH) - 1)) << (m * T_WIDTH); - } - buffer_idx++; - } - } else { - for (int l = 0; l < x_block; l++) { - dst[buffer_idx++] = src[i * y_block + k][j * x_block + l]; + for (int l = 0; l < 
x_block; l++) { + int block_idx = l + k * x_block; + tmp |= (src[i * y_block + k][j * x_block + l] & mask) << ((block_idx % ratio) * SRC_T_WIDTH); + // When tmp is packed, write to destination array + if (block_idx % ratio == ratio - 1) { + dst[buffer_idx++] = tmp; + tmp = 0; } } } @@ -175,31 +170,28 @@ void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_bloc } } -template -void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block) { +template +void unpackBuffer(DST_T **dst, SRC_T *src, int y_size, int x_size, int y_block, int x_block) { + assert((DST_T_WIDTH * x_block * y_block) % SRC_T_WIDTH == 0); int buffer_idx = 0; + long long int mask = (1ULL << DST_T_WIDTH) - 1; + int ratio = SRC_T_WIDTH / DST_T_WIDTH; for (int i = 0; i < y_size / y_block; i++) { for (int j = 0; j < x_size / x_block; j++) { for (int k = 0; k < y_block; k++) { - if (T_WIDTH < 8) { - for (int l = 0; l < x_block; l += 8 / T_WIDTH) { - for (int m = 0; m < 8 / T_WIDTH; m++) { - dst[i * y_block + k][j * x_block + l + m] = (src[buffer_idx] >> (m * T_WIDTH)) - & ((1 << T_WIDTH) - 1); - } + for (int l = 0; l < x_block; l++) { + int block_idx = l + k * x_block; + dst[i * y_block + k][j * x_block + l] = (src[buffer_idx] >> ((block_idx % ratio) * DST_T_WIDTH)) & mask; + if (block_idx % ratio == ratio - 1) { buffer_idx++; } - } else { - for (int l = 0; l < x_block; l++) { - dst[i * y_block + k][j * x_block + l] = src[buffer_idx++]; - } } } } } } -template +template T ** allocInit2dArray(int rows, int cols) { // Allocate T **array = static_cast(malloc(sizeof(T *) * rows)); @@ -209,8 +201,23 @@ T ** allocInit2dArray(int rows, int cols) { // Init for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { - array[i][j] = - static_cast(rand_r(&globalSeed) % (1LL << (T_WIDTH - 1)) - (1LL << (T_WIDTH - 2))); + array[i][j] = static_cast(rand_r(&globalSeed)); + } + } + return array; +} + +template +T ** allocSet2dArray(int rows, int cols, int val) { + // Allocate + T **array = static_cast(malloc(sizeof(T *) * rows)); + for (int i = 0; i < rows; i++) { + array[i] = static_cast(malloc(sizeof(T) * cols)); + } + // Init + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + array[i][j] = static_cast(val); } } return array; @@ -563,45 +570,6 @@ void printParameters() { printf("VTA_ACC_ELEM_BYTES: %d\n", VTA_ACC_ELEM_BYTES); printf("VTA_BLOCK_IN: %d\n", VTA_BLOCK_IN); printf("VTA_BLOCK_OUT: %d\n", VTA_BLOCK_OUT); - printf("VTA_INSN_MEM_0 [%d-%d]\n", VTA_INSN_MEM_0_0, VTA_INSN_MEM_0_1); - printf("VTA_INSN_MEM_1 [%d]\n", VTA_INSN_MEM_1); - printf("VTA_INSN_MEM_2 [%d]\n", VTA_INSN_MEM_2); - printf("VTA_INSN_MEM_3 [%d]\n", VTA_INSN_MEM_3); - printf("VTA_INSN_MEM_4 [%d]\n", VTA_INSN_MEM_4); - printf("VTA_INSN_MEM_5 [%d-%d]\n", VTA_INSN_MEM_5_0, VTA_INSN_MEM_5_1); - printf("VTA_INSN_MEM_6 [%d-%d]\n", VTA_INSN_MEM_6_0, VTA_INSN_MEM_6_1); - printf("VTA_INSN_MEM_7 [%d-%d]\n", VTA_INSN_MEM_7_0, VTA_INSN_MEM_7_1); - printf("VTA_INSN_MEM_8 [%d-%d]\n", VTA_INSN_MEM_8_0, VTA_INSN_MEM_8_1); - printf("VTA_INSN_MEM_9 [%d-%d]\n", VTA_INSN_MEM_9_0, VTA_INSN_MEM_9_1); - printf("VTA_INSN_MEM_A [%d-%d]\n", VTA_INSN_MEM_A_0, VTA_INSN_MEM_A_1); - printf("VTA_INSN_MEM_B [%d-%d]\n", VTA_INSN_MEM_B_0, VTA_INSN_MEM_B_1); - printf("VTA_INSN_MEM_C [%d-%d]\n", VTA_INSN_MEM_C_0, VTA_INSN_MEM_C_1); - printf("VTA_INSN_MEM_D [%d-%d]\n", VTA_INSN_MEM_D_0, VTA_INSN_MEM_D_1); - printf("VTA_INSN_MEM_E [%d-%d]\n", VTA_INSN_MEM_E_0, VTA_INSN_MEM_E_1); - printf("VTA_INSN_GEM_0 [%d-%d]\n", VTA_INSN_GEM_0_0, 
VTA_INSN_GEM_0_1); - printf("VTA_INSN_GEM_1 [%d]\n", VTA_INSN_GEM_1); - printf("VTA_INSN_GEM_2 [%d]\n", VTA_INSN_GEM_2); - printf("VTA_INSN_GEM_3 [%d]\n", VTA_INSN_GEM_3); - printf("VTA_INSN_GEM_4 [%d]\n", VTA_INSN_GEM_4); - printf("VTA_INSN_GEM_5 [%d]\n", VTA_INSN_GEM_5); - printf("VTA_INSN_GEM_6 [%d-%d]\n", VTA_INSN_GEM_6_0, VTA_INSN_GEM_6_1); - printf("VTA_INSN_GEM_7 [%d-%d]\n", VTA_INSN_GEM_7_0, VTA_INSN_GEM_7_1); - printf("VTA_INSN_GEM_8 [%d-%d]\n", VTA_INSN_GEM_8_0, VTA_INSN_GEM_8_1); - printf("VTA_INSN_GEM_9 [%d-%d]\n", VTA_INSN_GEM_9_0, VTA_INSN_GEM_9_1); - printf("VTA_INSN_GEM_A [%d-%d]\n", VTA_INSN_GEM_A_0, VTA_INSN_GEM_A_1); - printf("VTA_INSN_GEM_B [%d-%d]\n", VTA_INSN_GEM_B_0, VTA_INSN_GEM_B_1); - printf("VTA_INSN_GEM_C [%d-%d]\n", VTA_INSN_GEM_C_0, VTA_INSN_GEM_C_1); - printf("VTA_INSN_GEM_D [%d-%d]\n", VTA_INSN_GEM_D_0, VTA_INSN_GEM_D_1); - printf("VTA_INSN_GEM_E [%d-%d]\n", VTA_INSN_GEM_E_0, VTA_INSN_GEM_E_1); - printf("VTA_INSN_GEM_F [%d-%d]\n", VTA_INSN_GEM_F_0, VTA_INSN_GEM_F_1); - printf("VTA_INSN_ALU_E [%d-%d]\n", VTA_INSN_ALU_E_0, VTA_INSN_ALU_E_1); - printf("VTA_INSN_ALU_F [%d]\n", VTA_INSN_ALU_F); - printf("VTA_INSN_ALU_G [%d-%d]\n", VTA_INSN_ALU_G_0, VTA_INSN_ALU_G_1); - printf("VTA_UOP_GEM_0 [%d-%d]\n", VTA_UOP_GEM_0_0, VTA_UOP_GEM_0_1); - printf("VTA_UOP_GEM_1 [%d-%d]\n", VTA_UOP_GEM_1_0, VTA_UOP_GEM_1_1); - printf("VTA_UOP_GEM_2 [%d-%d]\n", VTA_UOP_GEM_2_0, VTA_UOP_GEM_2_1); - printf("VTA_UOP_ALU_0 [%d-%d]\n", VTA_UOP_ALU_0_0, VTA_UOP_ALU_0_1); - printf("VTA_UOP_ALU_1 [%d-%d]\n", VTA_UOP_ALU_1_0, VTA_UOP_ALU_1_1); } void printInstruction(int num_insn, VTAGenericInsn *insns) { @@ -742,7 +710,6 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp // Some assertions assert(batch % VTA_BATCH == 0); assert(vector_size % VTA_BLOCK_OUT == 0); - assert(!(opcode == VTA_ALU_OPCODE_SHR && !use_imm)); printf("=====================================================================================\n"); printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n", getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression); @@ -764,17 +731,21 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp for (int b = 0; b < batch / VTA_BATCH; b++) { if (opcode == VTA_ALU_OPCODE_MIN) { immediate[b] = static_cast( - rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2))); } else if (opcode == VTA_ALU_OPCODE_MAX) { immediate[b] = static_cast( - rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2))); } else if (opcode == VTA_ALU_OPCODE_ADD) { immediate[b] = static_cast( - rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); + rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2))); } else if (opcode == VTA_ALU_OPCODE_SHR) { immediate[b] = static_cast( - rand_r(&globalSeed) % VTA_ACC_WIDTH - VTA_ACC_WIDTH/2); + rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2))); } + // else if (opcode == VTA_ALU_OPCODE_MUL) { + // immediate[b] = static_cast( + // rand_r(&globalSeed) % (1LL << (VTA_MUL_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_MUL_ARG_BIT_WIDTH - 2))); + // } } // Initialize instructions @@ -845,7 +816,10 @@ int 
alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); } else if (opcode == VTA_ALU_OPCODE_ADD) { inputs[i][j] = static_cast( - rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 2)) - (1LL << (VTA_INP_WIDTH - 3))); + } else if (opcode == VTA_ALU_OPCODE_SHR) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2))); } } } @@ -854,54 +828,55 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp out_T **outputs_ref = alloc2dArray(batch, vector_size); for (int i = 0; i < batch; i++) { for (int j = 0; j < vector_size; j++) { - acc_T tmp = 0; + acc_T out_val = 0; + acc_T imm_val = immediate[i / VTA_BATCH]; + acc_T src_val = inputs[i][j + vector_size]; if (opcode == VTA_ALU_OPCODE_MIN) { if (!use_imm) { - tmp = inputs[i][j] < inputs[i][j + vector_size] ? - inputs[i][j] : - inputs[i][j + vector_size]; + out_val = inputs[i][j] < src_val ? inputs[i][j] : src_val; } else { - tmp = inputs[i][j] < immediate[i / VTA_BATCH] ? - inputs[i][j] : - immediate[i / VTA_BATCH]; + out_val = inputs[i][j] < imm_val ? inputs[i][j] : imm_val; } } else if (opcode == VTA_ALU_OPCODE_MAX) { if (!use_imm) { - tmp = inputs[i][j] > inputs[i][j + vector_size] ? - inputs[i][j] : - inputs[i][j + vector_size]; + out_val = inputs[i][j] > src_val ? inputs[i][j] : src_val; } else { - tmp = inputs[i][j] > immediate[i / VTA_BATCH] ? - inputs[i][j] : - immediate[i / VTA_BATCH]; + out_val = inputs[i][j] > imm_val ? inputs[i][j] : imm_val; } } else if (opcode == VTA_ALU_OPCODE_ADD) { if (!use_imm) { - tmp = inputs[i][j] + inputs[i][j + vector_size]; + out_val = inputs[i][j] + src_val; } else { - tmp = inputs[i][j] + immediate[i / VTA_BATCH]; + out_val = inputs[i][j] + imm_val; } } else if (opcode == VTA_ALU_OPCODE_SHR) { - if (immediate[i / VTA_BATCH] >= 0) { - tmp = inputs[i][j] >> immediate[i / VTA_BATCH]; + if (!use_imm) { + if (src_val >= 0) { + out_val = inputs[i][j] >> src_val; + } else { + out_val = inputs[i][j] << (0 - src_val); + } } else { - tmp = inputs[i][j] << (0 - immediate[i / VTA_BATCH]); + if (imm_val >= 0) { + out_val = inputs[i][j] >> imm_val; + } else { + out_val = inputs[i][j] << (0 - imm_val); + } } } - // Set - outputs_ref[i][j] = (out_T) tmp; + outputs_ref[i][j] = (out_T) out_val; } } // Pack input buffer - acc_T *bias_buf = - static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets)); - packBuffer( + uint32_t *bias_buf = static_cast( + allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets)); + packBuffer( bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT); // Prepare output buffer - out_T *output_buf = - static_cast(allocBuffer(VTA_INP_ELEM_BYTES * batch * tx_size * input_sets)); + uint32_t *output_buf = static_cast( + allocBuffer(VTA_OUT_ELEM_BYTES * batch * tx_size * input_sets)); #ifdef NO_SIM // Invoke the VTA @@ -914,20 +889,20 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp vta(ins_size, (volatile insn_T *) insn_buf, (volatile uop_T *) uop_buf, - (volatile inp_vec_T *) NULL, - (volatile wgt_vec_T *) NULL, - (volatile acc_vec_T *) bias_buf, - (volatile out_vec_T *) output_buf); + (volatile bus_T *) NULL, + (volatile bus_T *) NULL, + (volatile bus_T *) bias_buf, + (volatile bus_T *) output_buf); #endif // Unpack output buffer out_T 
**outputs = alloc2dArray(batch, vector_size); - unpackBuffer(outputs, - output_buf, - batch, - vector_size, - VTA_BATCH, - VTA_BLOCK_OUT); + unpackBuffer(outputs, + output_buf, + batch, + vector_size, + VTA_BATCH, + VTA_BLOCK_OUT); // Correctness checks int err = 0; @@ -1123,11 +1098,11 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, #endif // Initialize inputs - inp_T **inputs = allocInit2dArray(batch, in_feat); + inp_T **inputs = allocInit2dArray(batch, in_feat); // Initialize weights - wgt_T **weights = allocInit2dArray(out_feat, in_feat); + wgt_T **weights = allocInit2dArray(out_feat, in_feat); // Initialize biases - acc_T **biases = allocInit2dArray(batch, out_feat); + acc_T **biases = allocInit2dArray(batch, out_feat); // Reference GEMM implementation out_T **outputs_ref = alloc2dArray(batch, out_feat); @@ -1143,31 +1118,35 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, } // Prepare the input buffer - inp_T *input_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * inp_size)); - packBuffer(input_buf, - inputs, - batch, - in_feat, - VTA_BATCH, - VTA_BLOCK_IN); + uint32_t *input_buf = static_cast( + allocBuffer(VTA_INP_ELEM_BYTES * inp_size)); + packBuffer(input_buf, + inputs, + batch, + in_feat, + VTA_BATCH, + VTA_BLOCK_IN); // Prepare the weight buffer - wgt_T *weight_buf = static_cast(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size)); - packBuffer(weight_buf, - weights, - out_feat, - in_feat, - VTA_BLOCK_OUT, - VTA_BLOCK_IN); + uint32_t *weight_buf = static_cast( + allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size)); + packBuffer(weight_buf, + weights, + out_feat, + in_feat, + VTA_BLOCK_OUT, + VTA_BLOCK_IN); // Prepare the bias buffer - acc_T *bias_buf = static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * out_size)); - packBuffer(bias_buf, - biases, - batch, - out_feat, - VTA_BATCH, - VTA_BLOCK_OUT); + uint32_t *bias_buf = static_cast( + allocBuffer(VTA_ACC_ELEM_BYTES * out_size)); + packBuffer(bias_buf, + biases, + batch, + out_feat, + VTA_BATCH, + VTA_BLOCK_OUT); // Prepare the output buffer - out_T *output_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * out_size)); + uint32_t *output_buf = static_cast( + allocBuffer(VTA_INP_ELEM_BYTES * out_size)); #ifdef NO_SIM // Invoke the VTA @@ -1187,20 +1166,20 @@ int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, vta(ins_size, (volatile insn_T *) insn_buf, (volatile uop_T *) uop_buf, - (volatile inp_vec_T *) input_buf, - (volatile wgt_vec_T *) weight_buf, - (volatile acc_vec_T *) bias_buf, - (volatile out_vec_T *) output_buf); + (volatile bus_T *) input_buf, + (volatile bus_T *) weight_buf, + (volatile bus_T *) bias_buf, + (volatile bus_T *) output_buf); #endif // Unpack output data out_T **outputs = alloc2dArray(batch, out_feat); - unpackBuffer(outputs, - output_buf, - batch, - out_feat, - VTA_BATCH, - VTA_BLOCK_OUT); + unpackBuffer(outputs, + output_buf, + batch, + out_feat, + VTA_BATCH, + VTA_BLOCK_OUT); // Correctness checks int err = 0; @@ -1352,11 +1331,11 @@ int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression #endif // Initialize inputs - inp_T **inputs = allocInit2dArray(batch, in_channels); + inp_T **inputs = allocInit2dArray(batch, in_channels); // Initialize weights - wgt_T **weights = allocInit2dArray(out_channels, in_channels); + wgt_T **weights = allocInit2dArray(out_channels, in_channels); // Initialize biases - acc_T **biases = allocInit2dArray(batch, out_channels); + acc_T **biases = allocInit2dArray(batch, 
out_channels); // Reference GEMM implementation out_T **outputs_ref = alloc2dArray(batch, out_channels); @@ -1372,31 +1351,31 @@ int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression } // Prepare the input buffer - inp_T *input_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * inp_size)); - packBuffer(input_buf, - inputs, - batch, - in_channels, - VTA_BATCH, - VTA_BLOCK_IN); + uint32_t *input_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * inp_size)); + packBuffer(input_buf, + inputs, + batch, + in_channels, + VTA_BATCH, + VTA_BLOCK_IN); // Prepare the weight buffer - wgt_T *weight_buf = static_cast(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size)); - packBuffer(weight_buf, - weights, - out_channels, - in_channels, - VTA_BLOCK_OUT, - VTA_BLOCK_IN); + uint32_t *weight_buf = static_cast(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size)); + packBuffer(weight_buf, + weights, + out_channels, + in_channels, + VTA_BLOCK_OUT, + VTA_BLOCK_IN); // Prepare the bias buffer - acc_T *bias_buf = static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * out_size)); - packBuffer(bias_buf, - biases, - batch, - out_channels, - VTA_BATCH, - VTA_BLOCK_OUT); + uint32_t *bias_buf = static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * out_size)); + packBuffer(bias_buf, + biases, + batch, + out_channels, + VTA_BATCH, + VTA_BLOCK_OUT); // Prepare the output buffer - out_T *output_buf = static_cast(allocBuffer(VTA_INP_ELEM_BYTES * out_size)); + uint32_t *output_buf = static_cast(allocBuffer(VTA_OUT_ELEM_BYTES * out_size)); #ifdef NO_SIM // Invoke the VTA @@ -1416,20 +1395,20 @@ int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression vta(ins_size, (volatile insn_T *) insn_buf, (volatile uop_T *) uop_buf, - (volatile inp_vec_T *) input_buf, - (volatile wgt_vec_T *) weight_buf, - (volatile acc_vec_T *) bias_buf, - (volatile out_vec_T *) output_buf); + (volatile bus_T *) input_buf, + (volatile bus_T *) weight_buf, + (volatile bus_T *) bias_buf, + (volatile bus_T *) output_buf); #endif // Unpack output data out_T **outputs = alloc2dArray(batch, out_channels); - unpackBuffer(outputs, - output_buf, - batch, - out_channels, - VTA_BATCH, - VTA_BLOCK_OUT); + unpackBuffer(outputs, + output_buf, + batch, + out_channels, + VTA_BATCH, + VTA_BLOCK_OUT); // Correctness checks int err = 0; diff --git a/vta/tests/hardware/common/test_lib.h b/vta/tests/hardware/common/test_lib.h index ee8c34009057..e4ba9c9944fb 100644 --- a/vta/tests/hardware/common/test_lib.h +++ b/vta/tests/hardware/common/test_lib.h @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2018 by Contributors * \file test_lib.cpp * \brief Test library for the VTA design simulation and driver tests. */ @@ -40,7 +39,6 @@ #include "../../../src/pynq/pynq_driver.h" #endif // VTA_TARGET_PYNQ -typedef uint64_t axi_T; typedef uint32_t uop_T; typedef int8_t wgt_T; typedef int8_t inp_T; @@ -95,14 +93,24 @@ template void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block); /*! -* \brief Allocates and initializes a 2D array in the heap. +* \brief Allocates and randomly initializes a 2D array in the heap. * \param rows Number of rows. * \param cols Number of columns. * \return Pointer to the 2D array. */ -template +template T ** allocInit2dArray(int rows, int cols); +/*! +* \brief Allocates and initializes a 2D array to a set value in the heap. +* \param rows Number of rows. +* \param cols Number of columns. +* \param val Value to set the whole array to. +* \return Pointer to the 2D array. 
+*/ +template +T ** allocSet2dArray(int rows, int cols, int val); + /*! * \brief Allocates a 2D array in the heap. * \param rows Number of rows. diff --git a/vta/tests/python/unittest/test_environment.py b/vta/tests/python/unittest/test_environment.py index d5f7a6f43be9..605a9e0dfcdd 100644 --- a/vta/tests/python/unittest/test_environment.py +++ b/vta/tests/python/unittest/test_environment.py @@ -24,7 +24,7 @@ def test_env(): def test_env_scope(): env = vta.get_env() - cfg = env.pkg_config().cfg_dict + cfg = env.cfg_dict cfg["TARGET"] = "xyz" with vta.Environment(cfg): assert vta.get_env().TARGET == "xyz" diff --git a/vta/tutorials/frontend/deploy_resnet_on_vta.py b/vta/tutorials/frontend/deploy_resnet_on_vta.py index 393574932841..b21d205bd9d8 100644 --- a/vta/tutorials/frontend/deploy_resnet_on_vta.py +++ b/vta/tutorials/frontend/deploy_resnet_on_vta.py @@ -100,9 +100,9 @@ # the host, make sure you've set the variables below to the IP of # your board. device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") - device_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091")) + device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") if not tracker_host or not tracker_port: - remote = rpc.connect(device_host, device_port) + remote = rpc.connect(device_host, int(device_port)) else: remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000)
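
Notes on the main API changes in this patch, with small compilable sketches.

The bitstream programming entry point now distinguishes FPGA targets ("pynq",
"ultra96") from simulation targets ("sim", "tsim"), so that programming is a
no-op under simulation instead of raising an error. A minimal C++ mirror of
that dispatch, with the actual pynq overlay download left out:

    #include <stdexcept>
    #include <string>

    // Mirrors the refactored bitstream_program() dispatch: FPGA targets
    // program the device, simulation targets do nothing, anything else
    // is rejected. The overlay download itself is omitted here.
    void bitstream_program(const std::string& target,
                           const std::string& bitstream) {
      if (target == "pynq" || target == "ultra96") {
        // pynq_bitstream_program(bitstream);  // real overlay download
      } else if (target == "sim" || target == "tsim") {
        // In simulation, bitstream programming is a no-op.
      } else {
        throw std::runtime_error("Unknown target " + target);
      }
    }

    int main() {
      bitstream_program("sim", "vta.bit");  // no-op under simulation
      return 0;
    }
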
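VTAFlushCache/VTAInvalidateCache now receive both the virtual and the physical
address of a CMA buffer, because the pynq 2.4 CMA library calls
(cma_flush_cache/cma_invalidate_cache) need both; runtime.cc advances the two
addresses in lockstep when flushing a sub-range, and gates the calls on
!kBufferCoherent && kAlwaysCache, with kBufferCoherent coming from the
VTA_COHERENT_ACCESSES config flag. A minimal sketch of that calling pattern;
fake_cma_flush_cache and the physical base address are hypothetical stand-ins:

    #include <cstdint>
    #include <cstdio>

    typedef uint32_t vta_phy_addr_t;

    // Hypothetical stand-in for pynq's cma_flush_cache(buf, phys, size);
    // the real call performs the actual cache maintenance.
    static void fake_cma_flush_cache(void* virt, vta_phy_addr_t phys,
                                     int size) {
      std::printf("flush virt=%p phys=0x%08x size=%d\n", virt, phys, size);
    }

    void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) {
      fake_cma_flush_cache(vir_addr, phy_addr, size);
    }

    int main() {
      static uint8_t buf[64];
      const vta_phy_addr_t phy = 0x18000000u;  // assumed CMA physical base
      // Flush a 16-byte window at offset 32: the virtual and physical
      // addresses are offset together, as DataBuffer::FlushCache does.
      VTAFlushCache(buf + 32, phy + 32, 16);
      return 0;
    }
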
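VTAMapRegister no longer takes a length argument: the register map range is a
single constant (VTA_IP_REG_MAP_RANGE) shared between the driver and the
hardware generator, and the per-stage control registers are addressed through
named offset macros instead of magic 0x10/0x18 values. A sketch of the
page-aligned /dev/mem mapping and offset-based access, using assumed values
for the range, offset, and base address (the real ones are derived by
pkg_config.py and emitted into hw_spec.h):

    #include <cstdint>
    #include <cstdio>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    // Assumed values for illustration only.
    #define REG_MAP_RANGE           0x1000
    #define FETCH_INSN_COUNT_OFFSET 0x10

    void* MapRegister(uint32_t addr) {
      // Align the physical address down to a page boundary, and keep the
      // intra-page offset so the full register range stays mapped.
      uint32_t virt_base = addr & ~(getpagesize() - 1);
      uint32_t virt_offset = addr - virt_base;
      int fd = open("/dev/mem", O_RDWR | O_SYNC);
      if (fd < 0) return nullptr;
      void* base = mmap(nullptr, REG_MAP_RANGE + virt_offset,
                        PROT_READ | PROT_WRITE, MAP_SHARED, fd, virt_base);
      close(fd);  // the mapping stays valid after the fd is closed
      return base == MAP_FAILED ? nullptr : base;
    }

    void WriteMappedReg(void* base, uint32_t offset, uint32_t val) {
      *reinterpret_cast<volatile uint32_t*>(
          reinterpret_cast<char*>(base) + offset) = val;
    }

    int main() {
      void* fetch = MapRegister(0x43C00000);  // assumed fetch-stage base
      if (fetch == nullptr) {
        std::printf("mapping failed (needs root and a real device)\n");
        return 1;
      }
      WriteMappedReg(fetch, FETCH_INSN_COUNT_OFFSET, 1);
      munmap(fetch, REG_MAP_RANGE);
      return 0;
    }
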
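packBuffer/unpackBuffer no longer assume that elements and bus words share a
type: the source and destination bit widths are independent template
parameters, elements are accumulated into a temporary word by mask-and-shift,
and the word is emitted once `ratio` elements have been packed. This is what
lets the test library avoid host-side types wider than 64 bits. A single-row
round-trip sketch of the same arithmetic (the real functions additionally
tile over y_block/x_block):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Pack n SRC_T_WIDTH-bit elements into DST_T words, least significant
    // element first, mirroring the refactored packBuffer arithmetic.
    template <typename DST_T, int DST_T_WIDTH,
              typename SRC_T, int SRC_T_WIDTH>
    void pack(DST_T* dst, const SRC_T* src, int n) {
      static_assert(DST_T_WIDTH % SRC_T_WIDTH == 0, "widths must divide");
      const int ratio = DST_T_WIDTH / SRC_T_WIDTH;
      const uint64_t mask = (1ULL << SRC_T_WIDTH) - 1;
      DST_T tmp = 0;
      for (int i = 0; i < n; ++i) {
        tmp |= static_cast<DST_T>((static_cast<uint64_t>(src[i]) & mask)
                                  << ((i % ratio) * SRC_T_WIDTH));
        if (i % ratio == ratio - 1) {  // word is full: emit and reset
          *dst++ = tmp;
          tmp = 0;
        }
      }
    }

    // Inverse: extract DST_T_WIDTH-bit fields back out of SRC_T words.
    // Exact when DST_T_WIDTH matches the destination storage width.
    template <typename DST_T, int DST_T_WIDTH,
              typename SRC_T, int SRC_T_WIDTH>
    void unpack(DST_T* dst, const SRC_T* src, int n) {
      const int ratio = SRC_T_WIDTH / DST_T_WIDTH;
      const uint64_t mask = (1ULL << DST_T_WIDTH) - 1;
      for (int i = 0; i < n; ++i) {
        dst[i] = static_cast<DST_T>(
            (static_cast<uint64_t>(src[i / ratio])
             >> ((i % ratio) * DST_T_WIDTH)) & mask);
      }
    }

    int main() {
      int8_t in[8] = {1, -2, 3, -4, 5, -6, 7, -8};
      uint32_t packed[2];
      int8_t out[8];
      pack<uint32_t, 32, int8_t, 8>(packed, in, 8);
      unpack<int8_t, 8, uint32_t, 32>(out, packed, 8);
      for (int i = 0; i < 8; ++i) assert(in[i] == out[i]);
      std::printf("8x int8 -> 2x uint32 round trip ok\n");
      return 0;
    }
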
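Finally, the rewritten ALU reference in alu_test gives SHR consistent
semantics for both operand forms: a non-negative shift amount (immediate or
per-element operand) shifts right arithmetically, and a negative amount
shifts left. The immediates themselves are now drawn from the op-specific
VTA_SHR_ARG_BIT_WIDTH range rather than from VTA_ACC_WIDTH. A compact sketch
of that reference behavior:

    #include <cassert>

    // SHR reference semantics from the updated alu_test: negative shift
    // amounts are interpreted as left shifts.
    int shr_ref(int val, int shift) {
      return shift >= 0 ? (val >> shift) : (val << (0 - shift));
    }

    int main() {
      assert(shr_ref(16, 2) == 4);    // ordinary right shift
      assert(shr_ref(16, -2) == 64);  // negative amount: left shift
      assert(shr_ref(-16, 2) == -4);  // arithmetic shift keeps the sign
      return 0;
    }
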