diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
index b8be98a53a74..45eea57a53ad 100644
--- a/cmake/modules/VTA.cmake
+++ b/cmake/modules/VTA.cmake
@@ -43,6 +43,7 @@ elseif(PYTHON)
     file(GLOB FSIM_RUNTIME_SRCS vta/src/*.cc)
     list(APPEND FSIM_RUNTIME_SRCS vta/src/sim/sim_driver.cc)
     list(APPEND FSIM_RUNTIME_SRCS vta/src/vmem/virtual_memory.cc vta/src/vmem/virtual_memory.h)
+    list(APPEND FSIM_RUNTIME_SRCS vta/src/sim/sim_tlpp.cc)
     # Target lib: vta_fsim
     add_library(vta_fsim SHARED ${FSIM_RUNTIME_SRCS})
     target_include_directories(vta_fsim PUBLIC vta/include)
@@ -54,6 +55,7 @@ elseif(PYTHON)
     if(APPLE)
       set_target_properties(vta_fsim PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
     endif(APPLE)
+    target_compile_definitions(vta_fsim PUBLIC USE_FSIM_TLPP)
   endif()
 
   # Cycle accurate simulator driver build
diff --git a/vta/include/vta/sim_tlpp.h b/vta/include/vta/sim_tlpp.h
new file mode 100644
index 000000000000..ead07f18028f
--- /dev/null
+++ b/vta/include/vta/sim_tlpp.h
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file sim_tlpp.h
+ * \brief TVM VTA multiple thread simulator header file.
+ */
+#ifndef VTA_SIM_TLPP_H_
+#define VTA_SIM_TLPP_H_
+#include <vta/hw_spec.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <vector>
+#include <ctime>
+#include <cassert>
+#include <queue>
+
+#define SCOREGEMM "gemm"
+#define SCORELOAD "load"
+#define SCORESTORE "store"
+#define SCOREUNKNOWN "unknown"
+typedef void (*Run_Function)(const VTAGenericInsn *, void *);  // callback: run one insn on a handle
+typedef enum {COREGEMM = 0, CORELOAD, CORESTORE, COREMAX} CORE_TYPE;  // COREMAX sizes the arrays
+typedef std::queue<const void*> Insn_q_t;  // per-core queue of instruction pointers
+typedef std::queue<int> Dep_q_t;  // queue of dependency tokens between two cores
+/*!
+ * \brief Simulate core-level pipeline parallelism logic.
+ */
+class TlppVerify {
+ public:
+    /*! \brief Return the single TlppVerify instance (a function-local static). */
+    static TlppVerify *Global() { static TlppVerify Cls; return &Cls;}
+
+    /*!
+     *  \brief Loop to process the queued instructions and verify the tlpp logic.
+     *  \param run_function function pointer used to execute an instruction.
+     *  \param fsim_handle class pointer of the function simulator class Device.
+     *  \param debug enables/disables debug output.
+     */
+    void TlppSynchronization(Run_Function run_function,
+                             void *fsim_handle,
+                             bool debug = false);
+    /*!
+     *  \brief Push an instruction into a queue for later execution.
+     *  \param insn instruction.
+     */
+    void TlppPushInsn(const VTAGenericInsn *insn);
+    /*! \brief Event pump to handle dependency events. */
+    void EventProcess(void);
+    /*! \brief Schedule a particular core to run. */
+    void CoreRun(CORE_TYPE core_type);
+
+ private:
+    /*! \brief TlppVerify constructor. */
+    TlppVerify();
+    /*!
+     * \brief Clear the class variables.
+     */
+    void Clear();
+    /*!
+     * \brief Check whether the insn dependency condition is satisfied, or send notifications.
+     * \param insn instruction.
+     * \param before_run true when the check happens before the instruction
+     *   executes (are the awaited dependencies satisfied?), false when it
+     *   happens after execution (send the notifications it requests).
+     * \return whether the instruction may proceed (false: an awaited token is missing).
+     */
+    bool InsnDependencyCheck(const VTAGenericInsn *insn, bool before_run);
+    /*!
+     * \brief Get the operation code from an instruction.
+     * \param insn instruction.
+     */
+    uint64_t GetOperationCode(const VTAGenericInsn *insn);
+    /*!
+     * \brief Find which core should run this instruction.
+     * \param operation_code operation type such as load/gemm etc.
+     * \param insn instruction.
+     */
+    CORE_TYPE GetCoreType(uint64_t operation_code, const VTAGenericInsn *insn);
+    /*!
+     * \brief Pick up the first instruction of the specified core.
+     * \param core_type core type.
+     */
+    const VTAGenericInsn *PickFrontInsn(uint64_t core_type);
+    /*!
+     * \brief Consume one instruction after it passes the dependency condition.
+     * \param core_type core type.
+     */
+    void ConsumeFrontInsn(uint64_t core_type);
+    /*!
+     * \brief Process the dependency logic for one instruction.
+     * \param before_run whether this call happens before the instruction runs.
+     * \param pop_prev whether the instruction depends on the previous core.
+     * \param pop_next whether the instruction depends on the next core.
+     * \param push_prev whether the instruction notifies the previous core.
+     * \param push_next whether the instruction notifies the next core.
+     * \param pop_prev_q queue of notifications from the previous core.
+     * \param pop_next_q queue of notifications from the next core.
+     * \param push_prev_q queue that receives the notification for the
+     *   previous core.
+     * \param push_next_q queue that receives the notification for the
+     *   next core.
+     * \param push_to_prev_q_indx core to wake up after notifying the previous core.
+     * \param push_to_next_q_indx core to wake up after notifying the next core.
+     */
+    bool DependencyProcess(bool before_run,
+        bool pop_prev, bool pop_next,
+        bool push_prev, bool push_next,
+        Dep_q_t *pop_prev_q, Dep_q_t *pop_next_q,
+        Dep_q_t *push_prev_q, Dep_q_t *push_next_q,
+        CORE_TYPE push_to_prev_q_indx, CORE_TYPE push_to_next_q_indx);
+    /*!
+     * \brief Return the name of a core type.
+     * \param core_type core type.
+     */
+    inline const char * GetCoreTypeName(CORE_TYPE core_type) {
+      return (core_type == COREGEMM) ? SCOREGEMM :
+        (core_type == CORELOAD) ? SCORELOAD :
+        (core_type == CORESTORE) ? SCORESTORE :
+        SCOREUNKNOWN;
+    }
+    /*! debug flag */
+    bool debug_;
+    /*! function simulator device class pointer */
+    void *fsim_handle_;
+    /*! function simulator instruction execute function pointer */
+    Run_Function run_fsim_function_;
+    /*! instruction queue for each core */
+    Insn_q_t insnq_array_[COREMAX];
+    /*! dependency queue from load to gemm */
+    Dep_q_t l2g_q_;
+    /*! dependency queue from store to gemm */
+    Dep_q_t s2g_q_;
+    /*! dependency queue from gemm to load */
+    Dep_q_t g2l_q_;
+    /*! dependency queue from gemm to store */
+    Dep_q_t g2s_q_;
+    /*! computation done */
+    int done_;
+    /*! event queue for core wake up */
+    std::queue<CORE_TYPE> dep_push_event_;
+};
+#endif  // VTA_SIM_TLPP_H_
diff --git a/vta/src/sim/sim_driver.cc b/vta/src/sim/sim_driver.cc
index eb497125b144..871097f2ef3f 100644
--- a/vta/src/sim/sim_driver.cc
+++ b/vta/src/sim/sim_driver.cc
@@ -25,6 +25,7 @@
 #include <vta/driver.h>
 #include <vta/hw_spec.h>
 #include <tvm/runtime/registry.h>
+#include <vta/sim_tlpp.h>
 #include <type_traits>
 #include <mutex>
 #include <map>
@@ -275,6 +276,7 @@ class Device {
   Device() {
     prof_ = Profiler::ThreadLocal();
     dram_ = DRAM::Global();
+    ptlpp = TlppVerify::Global();  // Global() returns a function-local static instance
   }
 
   int Run(vta_phy_addr_t insn_phy_addr,
@@ -286,26 +288,37 @@ class Device {
     for (uint32_t i = 0; i < insn_count; ++i) {
       this->Run(insn + i);
     }
+    this->TlppSynchronization();
     return 0;
   }
 
  private:
-  void Run(const VTAGenericInsn* insn) {
+  static void Run_Insn(const VTAGenericInsn* insn, void * dev) {
+    Device * device = reinterpret_cast<Device *> (dev);  // dev is used as the Device to run on
     const VTAMemInsn* mem = reinterpret_cast<const VTAMemInsn*>(insn);
     const VTAGemInsn* gem = reinterpret_cast<const VTAGemInsn*>(insn);
     const VTAAluInsn* alu = reinterpret_cast<const VTAAluInsn*>(insn);
     switch (mem->opcode) {
-      case VTA_OPCODE_LOAD: RunLoad(mem); break;
-      case VTA_OPCODE_STORE: RunStore(mem); break;
-      case VTA_OPCODE_GEMM: RunGEMM(gem); break;
-      case VTA_OPCODE_ALU: RunALU(alu); break;
-      case VTA_OPCODE_FINISH: ++finish_counter_; break;
+      case VTA_OPCODE_LOAD: device->RunLoad(mem); break;
+      case VTA_OPCODE_STORE: device->RunStore(mem); break;
+      case VTA_OPCODE_GEMM: device->RunGEMM(gem); break;
+      case VTA_OPCODE_ALU: device->RunALU(alu); break;
+      case VTA_OPCODE_FINISH: ++(device->finish_counter_); break;
       default: {
         LOG(FATAL) << "Unknown op_code" << mem->opcode;
       }
     }
   }
 
+ private:
+  void Run(const VTAGenericInsn* insn) {  // queue only; executed by TlppSynchronization()
+    ptlpp->TlppPushInsn(insn);
+  }
+  // Process the queued instructions, with Run_Insn as the per-instruction callback.
+  void TlppSynchronization(void) {
+    ptlpp->TlppSynchronization(Run_Insn, reinterpret_cast<void *> (this));
+  }
+
   void RunLoad(const VTAMemInsn* op) {
     if (op->x_size == 0) return;
     if (op->memory_type == VTA_MEM_ID_INP) {
@@ -466,6 +479,7 @@ class Device {
   Profiler* prof_;
   // The DRAM interface
   DRAM* dram_;
+  TlppVerify* ptlpp;
   // The SRAM
   SRAM<VTA_INP_WIDTH, VTA_BATCH * VTA_BLOCK_IN, VTA_INP_BUFF_DEPTH> inp_;
   SRAM<VTA_WGT_WIDTH, VTA_BLOCK_IN * VTA_BLOCK_OUT, VTA_WGT_BUFF_DEPTH> wgt_;
diff --git a/vta/src/sim/sim_tlpp.cc b/vta/src/sim/sim_tlpp.cc
new file mode 100644
index 000000000000..5a97b93b65dd
--- /dev/null
+++ b/vta/src/sim/sim_tlpp.cc
@@ -0,0 +1,214 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file sim_tlpp.cc
+ * \brief Simulate core-level pipeline parallelism logic.
+ */
+#include <vta/sim_tlpp.h>
+TlppVerify::TlppVerify() : done_(0) {
+  // Only the done flag is given a value here; the std::queue members start empty.
+}
+
+void TlppVerify::Clear() {
+  // Drop every instruction pointer still queued on any core.
+  for (Insn_q_t &pending : insnq_array_) {
+    while (!pending.empty()) pending.pop();
+  }
+  // NOTE(review): the dependency queues and dep_push_event_ are left untouched.
+  run_fsim_function_ = nullptr;
+  fsim_handle_ = nullptr;
+  done_ = 0;
+}
+
+uint64_t TlppVerify::GetOperationCode(const VTAGenericInsn *insn) {
+  // The opcode is read through the memory-instruction view of the generic insn.
+  return reinterpret_cast<const VTAMemInsn*>(insn)->opcode;
+}
+
+CORE_TYPE TlppVerify::GetCoreType(uint64_t operation_code,
+                              const VTAGenericInsn *insn) {
+  /*!
+   * Map an opcode (and, for LOAD, the memory it targets) to a core:
+   *  - STORE runs on the store core;
+   *  - LOAD with memory_type VTA_MEM_ID_INP or VTA_MEM_ID_WGT runs on
+   *    the load core;
+   *  - everything else, i.e. GEMM, ALU, any other LOAD and any opcode
+   *    not listed here, runs on the gemm core.
+   */
+  if (operation_code == VTA_OPCODE_STORE) {
+    return CORESTORE;
+  }
+  if (operation_code == VTA_OPCODE_LOAD) {
+    const VTAMemInsn* load = reinterpret_cast<const VTAMemInsn*>(insn);
+    const bool to_inp = load->memory_type == VTA_MEM_ID_INP;
+    const bool to_wgt = load->memory_type == VTA_MEM_ID_WGT;
+    if (to_inp || to_wgt) {
+      return CORELOAD;
+    }
+  }
+  return COREGEMM;
+}
+
+bool TlppVerify::DependencyProcess(bool before_run,
+    bool pop_prev, bool pop_next,
+    bool push_prev, bool push_next,
+    Dep_q_t *pop_prev_q, Dep_q_t *pop_next_q,
+    Dep_q_t *push_prev_q, Dep_q_t *push_next_q,
+    CORE_TYPE push_to_prev_q_indx, CORE_TYPE push_to_next_q_indx) {
+  // A set flag needs a real queue; callers pass nullptr where no such queue exists.
+  assert((!pop_prev || pop_prev_q) && (!pop_next || pop_next_q));
+  assert((!push_prev || push_prev_q) && (!push_next || push_next_q));
+  if (before_run) {
+    // Wait phase: every awaited token must be present before any is consumed.
+    if ((pop_prev && pop_prev_q->empty()) ||
+        (pop_next && pop_next_q->empty())) {
+      return false;
+    }
+    if (pop_next) pop_next_q->pop();
+    if (pop_prev) pop_prev_q->pop();
+  } else {  // Notify phase: post one token per direction and queue a wake-up event.
+    if (push_prev) {
+      push_prev_q->push(1);
+      dep_push_event_.push(push_to_prev_q_indx);
+    }
+    if (push_next) {
+      push_next_q->push(1);
+      dep_push_event_.push(push_to_next_q_indx);
+    }
+  }
+  return true;
+}
+
+bool TlppVerify::InsnDependencyCheck(const VTAGenericInsn *insn,
+                                     bool before_run) {
+  // The four dependency flags are read through the memory-instruction view.
+  const VTAMemInsn* flags = reinterpret_cast<const VTAMemInsn*>(insn);
+  const bool wait_prev = flags->pop_prev_dep;
+  const bool wait_next = flags->pop_next_dep;
+  const bool post_prev = flags->push_prev_dep;
+  const bool post_next = flags->push_next_dep;
+
+  // Hand DependencyProcess() the queues that link this core to its
+  // neighbours. gemm uses the load queues as "prev" and the store queues as
+  // "next"; load gets no "prev" queues and store no "next" (nullptr / COREMAX).
+  switch (GetCoreType(GetOperationCode(insn), insn)) {
+    case COREGEMM:
+      return DependencyProcess(before_run, wait_prev, wait_next,
+          post_prev, post_next,
+          &l2g_q_, &s2g_q_, &g2l_q_, &g2s_q_, CORELOAD, CORESTORE);
+    case CORELOAD:
+      return DependencyProcess(before_run, wait_prev, wait_next,
+          post_prev, post_next,
+          nullptr, &g2l_q_, nullptr, &l2g_q_, COREMAX, COREGEMM);
+    case CORESTORE:
+      return DependencyProcess(before_run, wait_prev, wait_next,
+          post_prev, post_next,
+          &g2s_q_, nullptr, &s2g_q_, nullptr, COREGEMM, COREMAX);
+    case COREMAX:
+      assert(0);
+      break;
+  }
+  // No core matched (COREMAX with asserts compiled out): report failure.
+  return false;
+}
+
+void TlppVerify::CoreRun(CORE_TYPE core_type) {
+  /*!
+   * Run this core's queued instructions in order until the queue is empty
+   * or the front instruction's dependency check fails.
+   */
+  const VTAGenericInsn *insn = PickFrontInsn(core_type);
+  while (insn) {
+    /*! Wait phase: stop here if a token this instruction pops is missing. */
+    if (!InsnDependencyCheck(insn, true)) {
+      break;
+    }
+    /*! Execute the instruction through the simulator callback. */
+    run_fsim_function_(insn, fsim_handle_);
+    /*! Notify phase: post the tokens this instruction pushes. */
+    InsnDependencyCheck(insn, false);
+    /*!
+     * Latch the done flag once FINISH has executed. It is only ever set
+     * here, never cleared: an instruction that runs after FINISH must not
+     * reset it, or TlppSynchronization() would loop forever.
+     */
+    if (GetOperationCode(insn) == VTA_OPCODE_FINISH) {
+      done_ = 1;
+    }
+
+    if (debug_) {
+      printf("this is thread for %s\n", GetCoreTypeName(core_type));
+    }
+    ConsumeFrontInsn(core_type);
+    insn = PickFrontInsn(core_type);
+  }
+}
+
+void TlppVerify::EventProcess(void) {
+  while (!dep_push_event_.empty()) {  // CoreRun() below may enqueue more events
+    const CORE_TYPE woken_core = dep_push_event_.front();
+    dep_push_event_.pop();  // dequeue first, then run the woken core
+    CoreRun(woken_core);
+  }
+}
+
+void TlppVerify::TlppSynchronization(Run_Function run_function,
+                                         void *fsim_handle,
+                                         bool debug) {
+  // Latch the callback, its context and the debug switch for CoreRun().
+  debug_ = debug;
+  run_fsim_function_ = run_function;
+  fsim_handle_ = fsim_handle;
+  done_ = 0;
+  // NOTE(review): exits only once done_ is set; a stream that never sets it loops forever.
+  do {
+    // Re-seeded from time() on every pass; the start core is chosen from that
+    // seed, then all cores are visited once in round-robin order.
+    unsigned int seed = time(NULL);
+    const uint8_t first = rand_r(&seed)%COREMAX;
+    for (int offset = 0; offset < COREMAX; ++offset) {
+      CoreRun(static_cast<CORE_TYPE>((first + offset) % COREMAX));
+    }
+    EventProcess();
+  } while (!done_);
+  Clear();
+}
+
+void TlppVerify::TlppPushInsn(const VTAGenericInsn *insn) {
+  // Only the pointer is queued, on the queue of the core GetCoreType() selects;
+  // the instruction itself is not copied.
+  const CORE_TYPE target = GetCoreType(GetOperationCode(insn), insn);
+  insnq_array_[target].push(static_cast<const void *>(insn));
+}
+
+const VTAGenericInsn *TlppVerify::PickFrontInsn(uint64_t core_type) {
+  // Peek only: the instruction stays queued. An empty queue yields nullptr.
+  const Insn_q_t &pending = insnq_array_[core_type];
+  if (pending.empty()) return nullptr;
+  // Entries are stored as const void*; cast back to the instruction type.
+  return reinterpret_cast<const VTAGenericInsn *> (pending.front());
+}
+
+void TlppVerify::ConsumeFrontInsn(uint64_t core_type) {
+  // Remove the front instruction of this core's queue, if there is one.
+  Insn_q_t &pending = insnq_array_[core_type];
+  if (!pending.empty()) pending.pop();
+}