apache · tmoreau89 · Sep 7, 2019 · Jul 16, 2019 · Aug 29, 2019 · tqchen
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
@@ -43,6 +43,7 @@ elseif(PYTHON)
     file(GLOB FSIM_RUNTIME_SRCS vta/src/*.cc)
     list(APPEND FSIM_RUNTIME_SRCS vta/src/sim/sim_driver.cc)
     list(APPEND FSIM_RUNTIME_SRCS vta/src/vmem/virtual_memory.cc vta/src/vmem/virtual_memory.h)
+    list(APPEND FSIM_RUNTIME_SRCS vta/src/sim/sim_tlpp.cc)
     # Target lib: vta_fsim
     add_library(vta_fsim SHARED ${FSIM_RUNTIME_SRCS})
     target_include_directories(vta_fsim PUBLIC vta/include)
@@ -54,6 +55,7 @@ elseif(PYTHON)
     if(APPLE)
       set_target_properties(vta_fsim PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
     endif(APPLE)
+    target_compile_definitions(vta_fsim PUBLIC USE_FSIM_TLPP)
   endif()
 
   # Cycle accurate simulator driver build

diff --git a/vta/include/vta/sim_tlpp.h b/vta/include/vta/sim_tlpp.h
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file sim_tlpp.h
+ * \brief TVM VTA multiple thread simulator header file.
+ */
+#ifndef VTA_SIM_TLPP_H_
+#define VTA_SIM_TLPP_H_
+#include <vta/hw_spec.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <vector>
+#include <ctime>
+#include <cassert>
+#include <queue>
+
+#define SCOREGEMM "gemm"  // printable name of the gemm core
+#define SCORELOAD "load"  // printable name of the load core
+#define SCORESTORE "store"  // printable name of the store core
+#define SCOREUNKNOWN "unknown"  // name used for any other core type
+typedef void (*Run_Function)(const VTAGenericInsn *, void *);  // executes one instruction
+typedef enum {COREGEMM = 0, CORELOAD, CORESTORE, COREMAX} CORE_TYPE;  // COREMAX: core count
+typedef std::queue<const void*> Insn_q_t;  // instruction queue
+typedef std::queue<int> Dep_q_t;  // dependency queue
+/*!
+ * \brief Simulates core-level pipeline parallelism logic.
+ */
+class TlppVerify {
+ public:
+    /*! \brief Return the single shared TlppVerify instance (function-local static). */
+    static TlppVerify *Global() { static TlppVerify Cls; return &Cls;}
+
+    /*!
+     *  \brief Loop to process instructions and verify the tlpp logic.
+     *  \param run_function function pointer used to execute an instruction.
+     *  \param fsim_handle pointer to the functional simulator class Device.
+     *  \param debug enables/disables debug.
+     */
+    void TlppSynchronization(Run_Function run_function,
+                             void *fsim_handle,
+                             bool debug = false);
+    /*!
+     *  \brief Push an instruction into a queue for later execution.
+     *  \param insn the instruction.
+     */
+    void TlppPushInsn(const VTAGenericInsn *insn);
+    /*! \brief Event pump that handles dependency events. */
+    void EventProcess(void);
+    /*! \brief Schedule a particular core to run. */
+    void CoreRun(CORE_TYPE core_type);
+
+ private:
+    /*! \brief Constructor; private, so the instance is obtained via Global(). */
+    TlppVerify();
+    /*!
+     * \brief Clear class variables.
+     */
+    void Clear();
+    /*!
+     * \brief Check whether the instruction's dependencies are satisfied and notify.
+     * \param insn the instruction.
+     * \param before_run whether this check happens before or after the
+     *   instruction executes: before execution it checks whether the
+     *   dependency conditions are satisfied; after execution it checks
+     *   whether notifications need to be sent.
+     */
+    bool InsnDependencyCheck(const VTAGenericInsn *insn, bool before_run);
+    /*!
+     * \brief Get the operation code from an instruction.
+     * \param insn the instruction.
+     */
+    uint64_t GetOperationCode(const VTAGenericInsn *insn);
+    /*!
+     * \brief Find which core should run this instruction.
+     * \param operation_code operation type such as load/gemm etc.
+     * \param insn the instruction.
+     */
+    CORE_TYPE GetCoreType(uint64_t operation_code, const VTAGenericInsn *insn);
+    /*!
+     * \brief Pick up the first instruction of the specified core.
+     * \param core_type core type
+     */
+    const VTAGenericInsn *PickFrontInsn(uint64_t core_type);
+    /*!
+     * \brief Consume one instruction after it passes the dependency condition.
+     * \param core_type core type
+     */
+    void ConsumeFrontInsn(uint64_t core_type);
+    /*!
+     * \brief Process the dependency logic of one instruction.
+     * \param before_run whether this call happens before the instruction runs.
+     * \param pop_prev whether the instruction depends on the previous core.
+     * \param pop_next whether the instruction depends on the next core.
+     * \param push_prev presumably whether to notify the previous core (TODO confirm).
+     * \param push_next presumably whether to notify the next core (TODO confirm).
+     * \param pop_prev_q queue of notifications from the previous core.
+     * \param pop_next_q queue of notifications from the next core.
+     * \param push_prev_q queue used to send notifications to the previous core.
+     * \param push_next_q queue used to send notifications to the next core.
+     * \param push_to_prev_q_indx core to wake up when a notification is
+     *  pushed for the previous core.
+     * \param push_to_next_q_indx core to wake up when a notification is
+     *  pushed for the next core.
+     */
+    bool DependencyProcess(bool before_run,
+        bool pop_prev, bool pop_next,
+        bool push_prev, bool push_next,
+        Dep_q_t *pop_prev_q, Dep_q_t *pop_next_q,
+        Dep_q_t *push_prev_q, Dep_q_t *push_next_q,
+        CORE_TYPE push_to_prev_q_indx, CORE_TYPE push_to_next_q_indx);
+    /*!
+     * \brief Return the printable name for a core type.
+     * \param core_type core type
+     */
+    inline const char * GetCoreTypeName(CORE_TYPE core_type) {
+      return (core_type == COREGEMM) ? SCOREGEMM :
+        (core_type == CORELOAD) ? SCORELOAD :
+        (core_type == CORESTORE) ? SCORESTORE :
+        SCOREUNKNOWN;
+    }
+    /*! \brief debug flag */
+    bool debug_;
+    /*! \brief pointer to the functional simulator Device class */
+    void *fsim_handle_;
+    /*! \brief functional simulator function pointer that executes an instruction */
+    Run_Function run_fsim_function_;
+    /*! \brief instruction queue for each core */
+    Insn_q_t insnq_array_[COREMAX];
+    /*! \brief dependency queue from load to gemm */
+    Dep_q_t l2g_q_;
+    /*! \brief dependency queue from store to gemm */
+    Dep_q_t s2g_q_;
+    /*! \brief dependency queue from gemm to load */
+    Dep_q_t g2l_q_;
+    /*! \brief dependency queue from gemm to store */
+    Dep_q_t g2s_q_;
+    /*! \brief computation done */
+    int done_;
+    /*! \brief event queue for core wake-up */
+    std::queue<CORE_TYPE> dep_push_event_;
+};
+#endif  // VTA_SIM_TLPP_H_
diff --git a/vta/src/sim/sim_driver.cc b/vta/src/sim/sim_driver.cc
@@ -25,6 +25,7 @@
 #include <vta/driver.h>
 #include <vta/hw_spec.h>
 #include <tvm/runtime/registry.h>
+#include <vta/sim_tlpp.h>
 #include <type_traits>
 #include <mutex>
 #include <map>
@@ -275,6 +276,7 @@ class Device {
   Device() {
     prof_ = Profiler::ThreadLocal();
     dram_ = DRAM::Global();
+    ptlpp = TlppVerify::Global();
   }
 
   int Run(vta_phy_addr_t insn_phy_addr,
@@ -286,26 +288,37 @@ class Device {
     for (uint32_t i = 0; i < insn_count; ++i) {
       this->Run(insn + i);
     }
+    this->TlppSynchronization();
     return 0;
   }
 
  private:
-  void Run(const VTAGenericInsn* insn) {
+  static void Run_Insn(const VTAGenericInsn* insn, void * dev) {
+    Device * device = reinterpret_cast<Device *> (dev);
     const VTAMemInsn* mem = reinterpret_cast<const VTAMemInsn*>(insn);
     const VTAGemInsn* gem = reinterpret_cast<const VTAGemInsn*>(insn);
     const VTAAluInsn* alu = reinterpret_cast<const VTAAluInsn*>(insn);
     switch (mem->opcode) {
-      case VTA_OPCODE_LOAD: RunLoad(mem); break;
-      case VTA_OPCODE_STORE: RunStore(mem); break;
-      case VTA_OPCODE_GEMM: RunGEMM(gem); break;
-      case VTA_OPCODE_ALU: RunALU(alu); break;
-      case VTA_OPCODE_FINISH: ++finish_counter_; break;
+      case VTA_OPCODE_LOAD: device->RunLoad(mem); break;
+      case VTA_OPCODE_STORE: device->RunStore(mem); break;
+      case VTA_OPCODE_GEMM: device->RunGEMM(gem); break;
+      case VTA_OPCODE_ALU: device->RunALU(alu); break;
+      case VTA_OPCODE_FINISH: ++(device->finish_counter_); break;
       default: {
         LOG(FATAL) << "Unknown op_code" << mem->opcode;
       }
     }
   }
 
+ private:
+  void Run(const VTAGenericInsn* insn) {
+    ptlpp->TlppPushInsn(insn);
+  }
+
+  void TlppSynchronization(void) {
+    ptlpp->TlppSynchronization(Run_Insn, reinterpret_cast<void *> (this));
+  }
+
   void RunLoad(const VTAMemInsn* op) {
     if (op->x_size == 0) return;
     if (op->memory_type == VTA_MEM_ID_INP) {
@@ -466,6 +479,7 @@ class Device {
   Profiler* prof_;
   // The DRAM interface
   DRAM* dram_;
+  TlppVerify* ptlpp;
   // The SRAM
   SRAM<VTA_INP_WIDTH, VTA_BATCH * VTA_BLOCK_IN, VTA_INP_BUFF_DEPTH> inp_;
   SRAM<VTA_WGT_WIDTH, VTA_BLOCK_IN * VTA_BLOCK_OUT, VTA_WGT_BUFF_DEPTH> wgt_;