diff --git a/.gitignore b/.gitignore
index 9ce5ec485e6a..53954204fc9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,6 +92,6 @@ ENV/
 *~
 build
 config.mk
-build_win
+build_*
 Win32
 *.dir
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9cd681bccfe4..3a1eb3ca1d39 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,14 +1,17 @@
 cmake_minimum_required(VERSION 3.5)
-project(tvm)
+project(tvm C CXX)
 
-include(cmake/Util.cmake)
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake)
+  include(${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake)
+endif()
 
-option(USE_OPENCL "Build with OpenCL" OFF)
-option(USE_CUDA "Build with CUDA" OFF)
-option(USE_LLVM "Build with LLVM" OFF)
-option(USE_RTTI "Build with RTTI" OFF)
+include(cmake/Util.cmake)  # provides tvm_option() and tvm_source_group()
+tvm_option(USE_CUDA "Build with CUDA" ON)  # ON => find_package(CUDA REQUIRED) must succeed
+tvm_option(USE_OPENCL "Build with OpenCL" ON)  # ON => the OpenCL package is REQUIRED
+tvm_option(USE_LLVM "Build with LLVM" OFF)
+tvm_option(USE_RTTI "Build with RTTI" OFF)
+tvm_option(USE_MSVC_MT "Build with MT" OFF)  # MSVC only: rewrite /MD to /MT in the C++ flags
 
-# include path
 include_directories("include")
 include_directories("HalideIR/src")
 set(TVM_LINKER_LIBS "")
@@ -20,24 +23,22 @@ if(MSVC)
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
   add_definitions(-D_SCL_SECURE_NO_WARNINGS)
   add_definitions(-DTVM_EXPORTS)
-  foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-    if(${flag_var} MATCHES "/MD")
-      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-    endif(${flag_var} MATCHES "/MD")
-  endforeach(flag_var)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")  # sources are .cc files, so this is a CXX flag
+  if(USE_MSVC_MT)  # opt-in: replace /MD with /MT in every C++ flag set
+    foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/MD")
+        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+      endif()
+    endforeach()
+  endif()
 else(MSVC)
   include(CheckCXXCompilerFlag)
   check_cxx_compiler_flag("-std=c++11"    SUPPORT_CXX11)
-  check_cxx_compiler_flag("-msse2"        SUPPORT_MSSE2)
   set(CMAKE_C_FLAGS "-O3 -fno-rtti -Wall -std=c++11 -fPIC")
-  if(SUPPORT_OPENMP)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
-  endif()
   set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS})
 endif(MSVC)
 
@@ -49,6 +50,7 @@ tvm_source_group("Source\\arithmetic"   GLOB "src/arithmetic/*.cc")
 tvm_source_group("Source\\schedule"   GLOB "src/schedule/*.cc")
 tvm_source_group("Source\\codegen"   GLOB "src/codegen/*.cc")
 tvm_source_group("Source\\codegen\\llvm"   GLOB "src/codegen/llvm/*.cc")
+tvm_source_group("Source\\codegen\\stack_vm"   GLOB "src/codegen/stack_vm/*.cc")
 tvm_source_group("Source\\pass"   GLOB "src/pass/*.cc")
 tvm_source_group("Source\\runtime"   GLOB "src/runtime/*.cc")
 tvm_source_group("Source\\runtime\\cuda"   GLOB "src/runtime/cuda/*.cc")
@@ -58,7 +60,7 @@ file(GLOB COMPILER_SRCS
     src/api/*.cc
     src/arithmetic/*.cc
     src/codegen/*.cc
-    src/stack_vm/*.cc
+    src/codegen/stack_vm/*.cc
     src/lang/*.cc
     src/pass/*.cc
     src/schedule/*.cc
@@ -71,19 +73,44 @@ file(GLOB RUNTIME_CUDA_SRCS src/runtime/cuda/*.cc)
 file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
 
 if(USE_CUDA)
+  # FindCUDA sets CUDA_INCLUDE_DIRS, CUDA_CUDART_LIBRARY and CUDA_CUDA_LIBRARY.
+  find_package(CUDA QUIET REQUIRED)
+  message(STATUS "Build with CUDA support...")
+  include_directories(${CUDA_INCLUDE_DIRS})
+  list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDART_LIBRARY})
+  list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDA_LIBRARY})
   list(APPEND RUNTIME_SRCS ${RUNTIME_CUDA_SRCS})
+  if(MSVC)
+    find_library(CUDA_NVRTC_LIB nvrtc
+      ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+      ${CUDA_TOOLKIT_ROOT_DIR}/lib/win32)
+    list(APPEND TVM_LINKER_LIBS ${CUDA_NVRTC_LIB})
+  endif()
+  add_definitions(-DTVM_CUDA_RUNTIME=1)
 else(USE_CUDA)
   add_definitions(-DTVM_CUDA_RUNTIME=0)
 endif(USE_CUDA)
 
 if(USE_OPENCL)
+  find_package(OpenCL QUIET REQUIRED)  # FindOpenCL: package and variable names are case-sensitive
+  message(STATUS "Build with OpenCL support...")
+  include_directories(${OpenCL_INCLUDE_DIRS})
+  list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenCL_LIBRARIES})
   list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS})
+  add_definitions(-DTVM_OPENCL_RUNTIME=1)
 else(USE_OPENCL)
   add_definitions(-DTVM_OPENCL_RUNTIME=0)
 endif(USE_OPENCL)
 
 if(USE_LLVM)
-  add_definitions(-DTVM_LLVM_VERSION=40)
+  find_package(LLVM REQUIRED CONFIG)
+  message(STATUS "Build with LLVM support...")
+  include_directories(${LLVM_INCLUDE_DIRS})
+  add_definitions(${LLVM_DEFINITIONS})
+  llvm_map_components_to_libnames(LLVM_LIBS all)
+  list(REMOVE_ITEM LLVM_LIBS LTO)
+  list(APPEND TVM_LINKER_LIBS ${LLVM_LIBS})
+  add_definitions(-DTVM_LLVM_VERSION=${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR})  # integer, e.g. 40 for 4.0.x
   list(APPEND COMPILER_SRCS ${COMPILER_LLVM_SRCS})
 endif(USE_LLVM)
 
@@ -109,9 +136,7 @@ else()
     set(CMAKE_SHARED_LIBRARY_PREFIX "")
 endif()
 
-
 add_library(libtvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS})
 add_library(libtvm_runtime SHARED ${RUNTIME_SRCS})
-
-target_link_libraries(libtvm ${TVM_LINKER_LIBS})
+target_link_libraries(libtvm ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS})  # libtvm also compiles RUNTIME_SRCS
 target_link_libraries(libtvm_runtime  ${TVM_RUNTIME_LINKER_LIBS})
diff --git a/cmake/Util.cmake b/cmake/Util.cmake
index 191145a49499..b7c228cf3520 100644
--- a/cmake/Util.cmake
+++ b/cmake/Util.cmake
@@ -12,3 +12,62 @@ function(tvm_source_group group)
     source_group(${group} FILES ${srcs2})
   endif()
 endfunction()
+
+#######################################################
+# Declare a boolean build option that the user can select, optionally only
+# when a condition holds.
+#
+# Usage:
+#   tvm_option(<option_variable> "doc string" <initial value or boolean expression> [IF <condition>])
+#
+# Arguments:
+#   variable    - name of the option (cache entry) to declare.
+#   description - help text forwarded to option().
+#   value       - initial value: a literal such as ON/OFF, the name of a
+#                 defined variable, or (with the extra arguments that precede
+#                 IF) a boolean expression evaluated by if().
+#   IF <cond>   - optional; when <cond> evaluates false the option is not
+#                 declared and <variable> is removed from the cache.
+function(tvm_option variable description value)
+  set(__value ${value})
+  set(__condition "")
+  set(__varname "__value")
+  # Arguments before IF extend the value expression; arguments after it
+  # form the availability condition.
+  foreach(arg ${ARGN})
+    if(arg STREQUAL "IF" OR arg STREQUAL "if")
+      set(__varname "__condition")
+    else()
+      list(APPEND ${__varname} ${arg})
+    endif()
+  endforeach()
+  unset(__varname)
+  # No IF clause: use an always-true condition.
+  if("${__condition}" STREQUAL "")
+    set(__condition 2 GREATER 1)
+  endif()
+
+  if(${__condition})
+    if("${__value}" MATCHES ";")
+      # Multi-token value: evaluate it as a boolean expression.
+      if(${__value})
+        option(${variable} "${description}" ON)
+      else()
+        option(${variable} "${description}" OFF)
+      endif()
+    elseif(DEFINED ${__value})
+      # Value names a defined variable: use that variable's truthiness.
+      if(${__value})
+        option(${variable} "${description}" ON)
+      else()
+        option(${variable} "${description}" OFF)
+      endif()
+    else()
+      # Plain literal: hand it to option() unchanged.
+      option(${variable} "${description}" ${__value})
+    endif()
+  else()
+    # Condition not met: drop any previously cached value.
+    unset(${variable} CACHE)
+  endif()
+endfunction()
diff --git a/python/tvm/addon/nvcc_compiler.py b/python/tvm/addon/nvcc_compiler.py
index 81465faff387..8c748bc7c2cc 100644
--- a/python/tvm/addon/nvcc_compiler.py
+++ b/python/tvm/addon/nvcc_compiler.py
@@ -46,7 +46,8 @@ def compile_source(code, target="ptx", arch=None,
     file_target = path_target if path_target else temp_target
     cmd = ["nvcc"]
     cmd += ["--%s" % target, "-O3"]
-    cmd += ["-arch", arch]
+    if arch:
+        cmd += ["-arch", arch]
     cmd += ["-o", file_target]
 
     if options:
diff --git a/python/tvm/libinfo.py b/python/tvm/libinfo.py
index 967f3e9c77ff..cbf25a083d56 100644
--- a/python/tvm/libinfo.py
+++ b/python/tvm/libinfo.py
@@ -44,7 +44,7 @@ def find_lib_path():
         raise RuntimeError('Cannot find the files.\n' +
                            'List of candidates:\n' + str('\n'.join(dll_path)))
     if use_runtime:
-        sys.stderr.write("Loading runtime library... this is execution only\n")
+        sys.stderr.write("Loading runtime library %s... exec only\n" % lib_found[0])
         sys.stderr.flush()
     return lib_found
 
diff --git a/python/tvm/module.py b/python/tvm/module.py
index a29067c1266b..2de93eb8859b 100644
--- a/python/tvm/module.py
+++ b/python/tvm/module.py
@@ -32,7 +32,7 @@ def imported_modules(self):
         modules : list of Modules
             The module
         """
-        nmod = ImportsSize(self)
+        nmod = _ImportsSize(self)
         return [_GetImport(self, i) for i in range(nmod)]
 
     def save(self, file_name, fmt=""):
diff --git a/src/codegen/llvm/llvm_common.h b/src/codegen/llvm/llvm_common.h
index 7a24799974a7..f0ee5ed589b2 100644
--- a/src/codegen/llvm/llvm_common.h
+++ b/src/codegen/llvm/llvm_common.h
@@ -39,7 +39,6 @@
 #include <utility>
 #include <string>
 
-
 namespace tvm {
 namespace codegen {
 
diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc
index 74abbd7318cd..0584ec6a0d40 100644
--- a/src/codegen/llvm/llvm_module.cc
+++ b/src/codegen/llvm/llvm_module.cc
@@ -6,6 +6,7 @@
 #ifdef TVM_LLVM_VERSION
 #include <tvm/runtime/packed_func.h>
 #include <tvm/codegen.h>
+#include <mutex>
 #include "./llvm_common.h"
 #include "./codegen_llvm.h"
 #include "../../runtime/file_util.h"
diff --git a/src/codegen/stack_vm/codegen_stack_vm.cc b/src/codegen/stack_vm/codegen_stack_vm.cc
index e905d7c192d7..7512b6eea961 100644
--- a/src/codegen/stack_vm/codegen_stack_vm.cc
+++ b/src/codegen/stack_vm/codegen_stack_vm.cc
@@ -417,7 +417,7 @@ TVM_STATIC_IR_FUNCTOR(CodeGenStackVM, vtable)
 .set_dispatch<LetStmt>([](const LetStmt *op, CodeGenStackVM* p) {
     p->Push(op->value);
     int64_t vid = p->AllocVarID(op->var.get());
-    p->PushOp(StackVM::STORE_HEAP, vid);
+    p->PushOp(StackVM::STORE_HEAP, static_cast<int>(vid));
     p->Push(op->body);
   })
 .set_dispatch<Ramp>([](const Ramp *op, CodeGenStackVM* p) {
@@ -445,7 +445,7 @@ TVM_STATIC_IR_FUNCTOR(CodeGenStackVM, vtable)
 .set_dispatch<Let>([](const Let *op, CodeGenStackVM* p) {
     p->Push(op->value);
     int64_t vid = p->AllocVarID(op->var.get());
-    p->PushOp(StackVM::STORE_HEAP, vid);
+    p->PushOp(StackVM::STORE_HEAP, static_cast<int>(vid));
     p->Push(op->body);
   })
 .set_dispatch<Load>([](const Load *op, CodeGenStackVM* p) {
diff --git a/src/lang/operation.cc b/src/lang/operation.cc
index ac1e9541744d..ceb457aadbd1 100644
--- a/src/lang/operation.cc
+++ b/src/lang/operation.cc
@@ -125,7 +125,7 @@ inline bool prove_equal(Expr lhs, Expr rhs) {
 }
 
 int ScanOpNode::num_outputs() const {
-  return update.size();
+  return static_cast<int>(update.size());
 }
 Array<IterVar> ScanOpNode::root_iter_vars() const {
   return Array<IterVar>{scan_axis};
diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc
index e4a2e76267bf..4445df46a507 100644
--- a/src/pass/make_api.cc
+++ b/src/pass/make_api.cc
@@ -103,7 +103,7 @@ LoweredFunc MakeAPI(Stmt body,
         MakeAssertEQ(v_num_packed_args, num_packed_args, os.str()));
   }
 
-  for (size_t i = 0; i < api_args.size(); ++i) {
+  for (int i = 0; i < static_cast<int>(api_args.size()); ++i) {
     Var v_arg = f_arg_decl(i);
-    if (i < static_cast<size_t>(num_packed_args)) {
+    if (i < static_cast<int>(num_packed_args)) {  // i is int now; keep the comparison signed
       seq_init.emplace_back(LetStmt::make(
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index 837e884dd6e3..4776bf85dc9a 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -89,7 +89,10 @@ struct TVMRuntimeEntry {
     if (val != nullptr) {
       num_par_threads = atoi(val);
     } else {
-      num_par_threads = std::thread::hardware_concurrency();
+      // Default to half of the hardware threads.  hardware_concurrency()
+      // may report 0 (unknown) or 1, so clamp the result to at least one.
+      num_par_threads = std::thread::hardware_concurrency() / 2;
+      if (num_par_threads < 1) num_par_threads = 1;
     }
   }
 };
@@ -127,7 +130,7 @@ int TVMModGetFunction(TVMModuleHandle mod,
                       TVMFunctionHandle *func) {
   API_BEGIN();
   PackedFunc pf = static_cast<Module*>(mod)->GetFunction(
-      func_name, query_imports);
+      func_name, query_imports != 0);
   if (pf != nullptr) {
     *func = new PackedFunc(pf);
   } else {
diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc
index 12bc1ca7fd40..113b73136d64 100644
--- a/src/runtime/cuda/cuda_module.cc
+++ b/src/runtime/cuda/cuda_module.cc
@@ -39,7 +39,7 @@ class CUDAModuleNode : public runtime::ModuleNode {
   ~CUDAModuleNode() {
     for (size_t i = 0; i < module_.size(); ++i) {
       if (module_[i] != nullptr) {
-        CUDA_CALL(cudaSetDevice(i));
+        CUDA_CALL(cudaSetDevice(static_cast<int>(i)));
         CUDA_DRIVER_CALL(cuModuleUnload(module_[i]));
       }
     }
diff --git a/src/runtime/dso_module.cc b/src/runtime/dso_module.cc
index 502f8216b16f..eb015a70c228 100644
--- a/src/runtime/dso_module.cc
+++ b/src/runtime/dso_module.cc
@@ -75,11 +75,13 @@ class DSOModuleNode : public ModuleNode {
   HMODULE lib_handle_{nullptr};
   // Load the library
   void Load(const std::string& name) {
-    lib_handle_ = LoadLibrary(name.c_str());
+    // Wide-char API is needed by LLVM; chars are widened 1:1 (ASCII only).
+    std::wstring wname(name.begin(), name.end());
+    lib_handle_ = LoadLibraryW(wname.c_str());
   }
   BackendPackedCFunc GetFuncPtr(const std::string& name) {
     return reinterpret_cast<BackendPackedCFunc>(
-        GetProcAddress(lib_handle_, name.c_str()));  // NOLINT(*)
+        GetProcAddress(lib_handle_, (LPCSTR)name.c_str()));  // NOLINT(*)
   }
   void* GetGlobalVPtr(const std::string& name) {
     return reinterpret_cast<void*>(
diff --git a/src/runtime/module.cc b/src/runtime/module.cc
index 0b630c1fa870..112dca59f50d 100644
--- a/src/runtime/module.cc
+++ b/src/runtime/module.cc
@@ -119,9 +119,9 @@ TVM_REGISTER_GLOBAL(_module__GetImport)
         imports().at(args[1].operator int());
     });
 
-TVM_REGISTER_GLOBAL(_module__GetTyeKey)
+TVM_REGISTER_GLOBAL(_module__GetTypeKey)
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = args[0].operator Module()->type_key();
+    *ret = std::string(args[0].operator Module()->type_key());
     });
 
 TVM_REGISTER_GLOBAL(_module__LoadFromFile)
diff --git a/src/schedule/bound.cc b/src/schedule/bound.cc
index c2fa061bde7c..e82e449db4d3 100644
--- a/src/schedule/bound.cc
+++ b/src/schedule/bound.cc
@@ -389,7 +389,7 @@ void InferRootBound(const Stage& stage,
   bool direct_consume_by_parent = false;
   for (int i = 0; i < stage->op->num_outputs(); ++i) {
     Tensor t = stage->op.output(i);
-    tmap.emplace(t, TensorDom(t.ndim()));
+    tmap.emplace(t, TensorDom(static_cast<int>(t.ndim())));
     auto it = feed_graph.find(t);
     if (it != feed_graph.end()) {
       for (const Operation& op : it->second) {
diff --git a/src/schedule/graph.cc b/src/schedule/graph.cc
index 539892d20add..fb30f998f694 100644
--- a/src/schedule/graph.cc
+++ b/src/schedule/graph.cc
@@ -22,6 +22,9 @@ struct TensorDimKey {
   TensorDimKey(const Tensor& t, int dim)
       : f(t->op), value_index(t->value_index), dim(dim) {
   }
+  TensorDimKey(const Tensor& t, size_t dim)
+      : f(t->op), value_index(t->value_index), dim(static_cast<int>(dim)) {
+  }
   inline bool operator==(const TensorDimKey& other) const {
     return f == other.f &&
         value_index == other.value_index &&
@@ -183,7 +186,7 @@ ReachGraph GetReachGraph(const Array<Operation>& ops) {
       const auto& init = op.as<ScanOpNode>()->init;
       for (size_t i = 0; i < update.size(); ++i) {
         Tensor t = op.output(i);
-        for (size_t k = 1; k < update[i]->shape.size(); ++k) {
+        for (int k = 1; k < static_cast<int>(update[i]->shape.size()); ++k) {
           reach[TensorDimKey(t, k)].emplace_back(
               TensorDimKey(update[i], k));
           reach[TensorDimKey(t, k)].emplace_back(
@@ -203,7 +206,7 @@ ReachGraph GetReachGraph(const Array<Operation>& ops) {
         if (call != nullptr && call->func.defined()) {
           if (!bset.count(call->func.get())) return;
           for (size_t i = 0; i < call->args.size(); ++i) {
-            TensorDimKey dkey(call, i);
+            TensorDimKey dkey(call, static_cast<int>(i));
             auto fpush = [&dkey, &vmap, &reach](const NodeRef& node) {
               const Variable *v = node.as<Variable>();
               auto it = vmap.find(v);
@@ -319,7 +322,7 @@ Map<IterVar, Expr> ScanFixPointAnalysis(
         if (call != nullptr && call->func.defined()) {
           for (size_t i = 0; i < call->args.size(); ++i) {
             auto it = vmap.find(call->args[i].get());
-            TensorDimKey src(call, i);
+            TensorDimKey src(call, static_cast<int>(i));
             if (it != vmap.end()) {
               f_merge_key(it->second, src);
             } else {
diff --git a/src/schedule/schedule_lang.cc b/src/schedule/schedule_lang.cc
index c384d465f69c..5bb6d0048af6 100644
--- a/src/schedule/schedule_lang.cc
+++ b/src/schedule/schedule_lang.cc
@@ -264,7 +264,7 @@ Schedule::Schedule(Array<Operation> ops) {
   }
   for (Operation op : post_order) {
     Stage stage(op);
-    stage->is_output = output_set.count(op);
+    stage->is_output = output_set.count(op) != 0;
     n->stages.push_back(stage);
     n->stage_map.Set(op, stage);
     // mark scan updates.
diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py
index 75798f0c4c2a..950602487e44 100644
--- a/tests/python/integration/test_ewise.py
+++ b/tests/python/integration/test_ewise.py
@@ -21,8 +21,10 @@ def test_add():
     # one line to build the function.
     def check_device(device, host="stackvm"):
         if not tvm.codegen.enabled(host):
+            print("skip because %s is not enabled.." % host)
             return
         if not tvm.codegen.enabled(device):
+            print("skip because %s is not enabled.." % device)
             return
         fadd = tvm.build(s, [A, B, C],
                          device, host,