From ac474dace4ec63ccd77d3d76206ca3810842d5b4 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Sat, 2 Dec 2023 03:44:27 +0900 Subject: [PATCH 1/8] based on Windows support PR #2465 by @andreigh * based on https://github.com/openai/triton/pull/2465 * manually applied, rebased, fix lint errors * use set_target_properties(), cleanup for windows * remove '/A' platform option to use windows ninja * remove unknown option '/m' * use sysconfig.get_config_var() to get the path of python*.lib * clang fix for windows * remove '-fPIC' for windows clang * fix download_and_copy() to support windows * add "exe" extension for windows * use "pyd" extension for windows to make importlib work Original-author-by: Andrei Gheorghe Signed-off-by: Won-Kyu Park --- .gitignore | 1 + CMakeLists.txt | 50 ++++++++++---- bin/CMakeLists.txt | 1 + lib/Conversion/TritonGPUToLLVM/CMakeLists.txt | 1 + python/setup.py | 68 +++++++++++-------- python/triton/common/backend.py | 9 ++- python/triton/common/build.py | 33 +++++++-- python/triton/compiler/backends/cuda.py | 3 + python/triton/compiler/make_launcher.py | 2 +- python/triton/runtime/driver.py | 2 +- 10 files changed, 116 insertions(+), 54 deletions(-) diff --git a/.gitignore b/.gitignore index 0180cd911245..05f922a11698 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ python/build/ python/triton.egg-info/ python/triton/_C/libtriton.pyd python/triton/_C/libtriton.so +python/triton/_C/triton.dll # Python caches __pycache__/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 8488174c5763..6abce0cb4511 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,8 +28,17 @@ set(TRITON_CODEGEN_BACKENDS "" CACHE STRING "Enable different codegen backends") # used conditionally in this file and by lit tests # Customized release build type with assertions: TritonRelBuildWithAsserts -set(CMAKE_C_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g") -set(CMAKE_CXX_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g") +if(NOT MSVC) + 
set(CMAKE_C_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g") + set(CMAKE_CXX_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g") +else() + set(CMAKE_C_FLAGS_TRITONRELBUILDWITHASSERTS "/Zi /Ob0 /Od /RTC1") + set(CMAKE_CXX_FLAGS_TRITONRELBUILDWITHASSERTS "/Zi /Ob0 /Od /RTC1") + set(CMAKE_EXE_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL") + set(CMAKE_MODULE_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL") + set(CMAKE_SHARED_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL") + set(CMAKE_STATIC_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL") +endif() # Default build type if(NOT CMAKE_BUILD_TYPE) @@ -47,7 +56,15 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) # Third-party include_directories(${PYBIND11_INCLUDE_DIR}) -set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS -fPIC -std=gnu++17 -fvisibility=hidden -fvisibility-inlines-hidden") +if(NOT MSVC) + if(NOT WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS -fPIC -std=gnu++17 -fvisibility=hidden -fvisibility-inlines-hidden") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS -std=gnu++17 -Wno-deprecated -fvisibility=hidden -fvisibility-inlines-hidden") + endif() +else() + set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS /wd4244 /wd4624 /wd4715 /wd4530") +endif() if(APPLE) set(CMAKE_OSX_DEPLOYMENT_TARGET 11.6) @@ -59,7 +76,7 @@ endif() if(NOT MLIR_DIR) if(NOT LLVM_LIBRARY_DIR) if(WIN32) - find_package(LLVM 13 REQUIRED COMPONENTS nvptx amdgpu) + find_package(LLVM 17 REQUIRED COMPONENTS nvptx amdgpu) include_directories(${LLVM_INCLUDE_DIRS}) separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) @@ -154,6 +171,8 @@ if(TRITON_BUILD_PYTHON_MODULE) if(PYTHON_INCLUDE_DIRS) include_directories(${PYTHON_INCLUDE_DIRS}) + message(STATUS "PYTHON_LIB_DIRS ${PYTHON_LIB_DIRS}") + link_directories(${PYTHON_LIB_DIRS}) else() find_package(Python3 REQUIRED COMPONENTS 
Development Interpreter) include_directories(${Python3_INCLUDE_DIRS}) @@ -163,16 +182,6 @@ if(TRITON_BUILD_PYTHON_MODULE) endif() endif() -# # Triton -# file(GLOB_RECURSE LIBTRITON_SRC lib/*.cc) -# if (WIN32 AND TRITON_BUILD_PYTHON_MODULE) -# Python3_add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC}) -# set_target_properties(triton PROPERTIES SUFFIX ".pyd") -# set_target_properties(triton PROPERTIES PREFIX "lib") -# else() -# add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC}) -# endif() - # MLIR find_package(MLIR REQUIRED CONFIG PATHS ${MLIR_DIR}) @@ -184,7 +193,11 @@ include(AddLLVM) include(AddMLIR) # Disable warnings that show up in external code (gtest;pybind11) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-covered-switch-default") +if(NOT MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-covered-switch-default") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /WX-") +endif() include_directories(${MLIR_INCLUDE_DIRS}) include_directories(${LLVM_INCLUDE_DIRS}) @@ -239,6 +252,8 @@ if(TRITON_BUILD_PYTHON_MODULE) target_link_libraries(triton PRIVATE ${LLVM_LIBRARIES} ${CMAKE_DL_LIBS} ${TRITON_LIBRARIES} ) + set_target_properties(triton PROPERTIES SUFFIX ".pyd") + set_target_properties(triton PROPERTIES PREFIX "lib") elseif(APPLE) target_link_libraries(triton ${LLVM_LIBRARIES} z ${TRITON_LIBRARIES} @@ -275,6 +290,11 @@ if (${CODEGEN_BACKENDS_LEN} GREATER 0) endforeach() endif() +if(WIN32) + option(CMAKE_USE_WIN32_THREADS_INIT "using WIN32 threads" ON) + option(gtest_disable_pthreads "Disable uses of pthreads in gtest." 
ON) +endif() + add_subdirectory(test) add_subdirectory(unittest) diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt index 41cb4d6b7ecb..7b1f7552b268 100644 --- a/bin/CMakeLists.txt +++ b/bin/CMakeLists.txt @@ -80,6 +80,7 @@ llvm_update_compile_flags(triton-translate) mlir_check_all_link_libraries(triton-translate) add_llvm_executable(triton-llvm-opt + PARTIAL_SOURCES_INTENDED triton-llvm-opt.cpp DEPENDS diff --git a/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt b/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt index 590bc6b99da0..c4d336366e80 100644 --- a/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt +++ b/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt @@ -56,6 +56,7 @@ add_mlir_conversion_library(TritonGPUToLLVM ) add_mlir_library(ASMBuilder + PARTIAL_SOURCES_INTENDED GCNAsmFormat.cpp PTXAsmFormat.cpp diff --git a/python/setup.py b/python/setup.py index 32eb9eb8b24a..180ce68f7676 100644 --- a/python/setup.py +++ b/python/setup.py @@ -115,7 +115,7 @@ def get_thirdparty_packages(triton_cache_path): if p.syspath_var_name in os.environ: package_dir = os.environ[p.syspath_var_name] version_file_path = os.path.join(package_dir, "version.txt") - if p.syspath_var_name not in os.environ and\ + if p.syspath_var_name not in os.environ and p.url and\ (not os.path.exists(version_file_path) or Path(version_file_path).read_text() != p.url): try: shutil.rmtree(package_root_dir) @@ -128,6 +128,9 @@ def get_thirdparty_packages(triton_cache_path): # write version url to package_dir with open(os.path.join(package_dir, "version.txt"), "w") as f: f.write(p.url) + elif p.syspath_var_name not in os.environ and not p.url: + raise RuntimeError( + f'{p.syspath_var_name} not set ! 
Please install {p.package} manually and set {p.syspath_var_name}.') if p.include_flag: thirdparty_cmake_args.append(f"-D{p.include_flag}={package_dir}/include") if p.lib_flag: @@ -143,15 +146,18 @@ def download_and_copy(src_path, variable, version, url_func): return base_dir = os.path.dirname(__file__) arch = platform.machine() - if arch == "x86_64": + if arch in ["x86_64", "AMD64"]: arch = "64" - url = url_func(arch, version) + supported = {"Linux": "linux", "Windows": "win"} + is_supported = platform.system() in supported + if is_supported: + url = url_func(supported[platform.system()], arch, version) dst_prefix = os.path.join(base_dir, "triton") dst_suffix = os.path.join("third_party", "cuda", src_path) dst_path = os.path.join(dst_prefix, dst_suffix) - is_linux = platform.system() == "Linux" + dst_path += ".exe" if os.name == "nt" else "" download = False - if is_linux: + if is_supported: download = True if os.path.exists(dst_path): curr_version = subprocess.check_output([dst_path, "--version"]).decode("utf-8").strip() @@ -163,6 +169,7 @@ def download_and_copy(src_path, variable, version, url_func): with tempfile.TemporaryDirectory() as temp_dir: file.extractall(path=temp_dir) src_path = os.path.join(temp_dir, src_path) + src_path += ".exe" if os.name == "nt" else "" os.makedirs(os.path.split(dst_path)[0], exist_ok=True) shutil.copy(src_path, dst_path) @@ -262,6 +269,10 @@ def build_extension(self, ext): "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON", "-DPYTHON_INCLUDE_DIRS=" + python_include_dir, ] + if platform.system() == "Windows": + installed_base = sysconfig.get_config_var('installed_base') + py_lib_dirs = os.getenv("PYTHON_LIB_DIRS", os.path.join(installed_base, "libs")) + cmake_args.append("-DPYTHON_LIB_DIRS=" + py_lib_dirs) if lit_dir is not None: cmake_args.append("-DLLVM_EXTERNAL_LIT=" + lit_dir) cmake_args.extend(thirdparty_cmake_args) @@ -276,10 +287,8 @@ def build_extension(self, ext): cmake_args += ["-DTRITON_CODEGEN_BACKENDS=" + all_codegen_backends] if 
platform.system() == "Windows": + cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg] cmake_args += [f"-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"] - if sys.maxsize > 2**32: - cmake_args += ["-A", "x64"] - build_args += ["--", "/m"] else: cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg] max_jobs = os.getenv("MAX_JOBS", str(2 * os.cpu_count())) @@ -321,27 +330,28 @@ def build_extension(self, ext): subprocess.check_call(["cmake", "--build", ".", "--target", "mlir-doc"], cwd=cmake_dir) -download_and_copy( - src_path="bin/ptxas", - variable="TRITON_PTXAS_PATH", - version="12.3.52", - url_func=lambda arch, version: - f"https://anaconda.org/nvidia/cuda-nvcc/12.3.52/download/linux-{arch}/cuda-nvcc-{version}-0.tar.bz2", -) -download_and_copy( - src_path="bin/cuobjdump", - variable="TRITON_CUOBJDUMP_PATH", - version="12.3.52", - url_func=lambda arch, version: - f"https://anaconda.org/nvidia/cuda-cuobjdump/12.3.52/download/linux-{arch}/cuda-cuobjdump-{version}-0.tar.bz2", -) -download_and_copy( - src_path="bin/nvdisasm", - variable="TRITON_NVDISASM_PATH", - version="12.3.52", - url_func=lambda arch, version: - f"https://anaconda.org/nvidia/cuda-nvdisasm/12.3.52/download/linux-{arch}/cuda-nvdisasm-{version}-0.tar.bz2", -) +if platform.system() in ["Linux", "Windows"]: + download_and_copy( + src_path="bin/ptxas", + variable="TRITON_PTXAS_PATH", + version="12.3.52", + url_func=lambda system, arch, version: + f"https://anaconda.org/nvidia/cuda-nvcc/{version}/download/{system}-{arch}/cuda-nvcc-{version}-0.tar.bz2", + ) + download_and_copy( + src_path="bin/cuobjdump", + variable="TRITON_CUOBJDUMP_PATH", + version="12.3.52", + url_func=lambda system, arch, version: + f"https://anaconda.org/nvidia/cuda-cuobjdump/{version}/download/{system}-{arch}/cuda-cuobjdump-{version}-0.tar.bz2", + ) + download_and_copy( + src_path="bin/nvdisasm", + variable="TRITON_NVDISASM_PATH", + version="12.3.52", + url_func=lambda system, arch, version: + 
f"https://anaconda.org/nvidia/cuda-nvdisasm/{version}/download/{system}-{arch}/cuda-nvdisasm-{version}-0.tar.bz2", + ) setup( name=os.environ.get("TRITON_WHEEL_NAME", "triton"), diff --git a/python/triton/common/backend.py b/python/triton/common/backend.py index f56cc7c9b1b2..e67fdb2f7b1b 100644 --- a/python/triton/common/backend.py +++ b/python/triton/common/backend.py @@ -106,6 +106,7 @@ def get_backend(device_type: str): def _path_to_binary(binary: str): + binary += ".exe" if os.name == "nt" else "" base_dir = os.path.join(os.path.dirname(__file__), os.pardir) paths = [ os.environ.get(f"TRITON_{binary.upper()}_PATH", ""), @@ -113,7 +114,10 @@ def _path_to_binary(binary: str): ] for p in paths: - bin = p.split(" ")[0] + if os.name != "nt": + bin = p.split(" ")[0] + else: + bin = p if os.path.exists(bin) and os.path.isfile(bin): result = subprocess.check_output([bin, "--version"], stderr=subprocess.STDOUT) if result is not None: @@ -152,7 +156,8 @@ def compute_core_version_key(): contents += [hashlib.sha1(f.read()).hexdigest()] # backend libtriton_hash = hashlib.sha1() - with open(os.path.join(TRITON_PATH, "_C/libtriton.so"), "rb") as f: + ext = "so" if os.name != "nt" else "pyd" + with open(os.path.join(TRITON_PATH, "_C", "libtriton." 
+ ext), "rb") as f: while True: chunk = f.read(1024**2) if not chunk: diff --git a/python/triton/common/build.py b/python/triton/common/build.py index bd8395d4af2d..c39564899c69 100644 --- a/python/triton/common/build.py +++ b/python/triton/common/build.py @@ -22,6 +22,9 @@ def libcuda_dirs(): if env_libcuda_path: return [env_libcuda_path] + if os.name == "nt": + return [os.environ.get("CUDA_PATH") + "\\lib\\x64"] + libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode() # each line looks like the following: # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1 @@ -60,6 +63,24 @@ def cuda_include_dir(): return os.path.join(cuda_path, "include") +def _cc_cmd(cc, src, out, include_dirs, library_dirs): + if cc == "cl": + cc_cmd = [cc, src, "/nologo", "/O2", "/LD"] + cc_cmd += [f"/I{dir}" for dir in include_dirs] + cc_cmd += ["/link"] + cc_cmd += [f"/LIBPATH:{dir}" for dir in library_dirs] + cc_cmd += ["cuda.lib", f"/OUT:{out}"] + else: + cc_cmd = [cc, src, "-O3", "-shared", "-fPIC"] + cc_cmd += [f"-I{dir}" for dir in include_dirs] + cc_cmd += [f"-L{dir}" for dir in library_dirs] + cc_cmd += ["-lcuda", "-o", out] + + if os.name == "nt": cc_cmd.pop(cc_cmd.index("-fPIC")) + + return cc_cmd + + def _build(name, src, srcdir): if is_hip(): hip_lib_dir = os.path.join(rocm_path_dir(), "lib") @@ -88,6 +109,10 @@ def _build(name, src, srcdir): if scheme == 'posix_local': scheme = 'posix_prefix' py_include_dir = sysconfig.get_paths(scheme=scheme)["include"] + py_lib_dirs = [] + if os.name == "nt": + installed_base = sysconfig.get_config_var('installed_base') + py_lib_dirs = [os.getenv("PYTHON_LIB_DIRS", os.path.join(installed_base, "libs"))] if is_hip(): ret = subprocess.check_call([ @@ -95,18 +120,14 @@ def _build(name, src, srcdir): f"-L{hip_lib_dir}", "-lamdhip64", "-o", so ]) else: - cc_cmd = [ - cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", "-lcuda", - "-o", so - ] - cc_cmd += [f"-L{dir}" for dir in 
cuda_lib_dirs] + cc_cmd = _cc_cmd(cc, src, so, [cu_include_dir, py_include_dir, srcdir], [*cuda_lib_dirs, *py_lib_dirs]) ret = subprocess.check_call(cc_cmd) if ret == 0: return so # fallback on setuptools extra_compile_args = [] - library_dirs = cuda_lib_dirs + library_dirs = [*cuda_lib_dirs, *py_lib_dirs] include_dirs = [srcdir, cu_include_dir] libraries = ['cuda'] # extra arguments diff --git a/python/triton/compiler/backends/cuda.py b/python/triton/compiler/backends/cuda.py index a019539ad812..49928ce49029 100644 --- a/python/triton/compiler/backends/cuda.py +++ b/python/triton/compiler/backends/cuda.py @@ -8,6 +8,7 @@ from ..utils import get_ids_of_tensormaps, parse_tma_info from ..make_launcher import make_stub import hashlib +import os def get_kernel_name(src: str, pattern: str) -> str: @@ -196,6 +197,8 @@ def make_ptx(src, metadata, opt, capability): def make_cubin(src, metadata, opt, capability): metadata["name"] = get_kernel_name(src, pattern='// .globl') ptxas, _ = path_to_ptxas() + if os.name == 'nt': + ptxas = f'"{ptxas}"' return compile_ptx_to_cubin(src, ptxas, capability, opt.enable_fp_fusion) def add_stages(self, stages, compiler_options, linker_options): diff --git a/python/triton/compiler/make_launcher.py b/python/triton/compiler/make_launcher.py index 52a8f74a11eb..84752c942ce6 100644 --- a/python/triton/compiler/make_launcher.py +++ b/python/triton/compiler/make_launcher.py @@ -25,7 +25,7 @@ def make_stub(name, signature, constants, ids, **kwargs): # name of files that are cached so_cache_key = make_so_cache_key(get_cuda_version_key(), signature, constants, ids, **kwargs) so_cache_manager = get_cache_manager(so_cache_key) - so_name = f"{name}.so" + so_name = f'{name}.{"so" if os.name != "nt" else "pyd"}' # retrieve stub from cache if it exists cache_path = so_cache_manager.get_file(so_name) if cache_path is None: diff --git a/python/triton/runtime/driver.py b/python/triton/runtime/driver.py index bf158bbb0e54..fc5c6542a1c4 100644 --- 
a/python/triton/runtime/driver.py +++ b/python/triton/runtime/driver.py @@ -38,7 +38,7 @@ def __init__(self): src = Path(os.path.join(dirname, "backends", "cuda.c")).read_text() key = hashlib.md5(src.encode("utf-8")).hexdigest() cache = get_cache_manager(key) - fname = "cuda_utils.so" + fname = "cuda_utils." + ("so" if os.name != "nt" else "pyd") cache_path = cache.get_file(fname) if cache_path is None: with tempfile.TemporaryDirectory() as tmpdir: From 4394973f3e1bb09c2f1225c462cf6f12ea323dce Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Sat, 2 Dec 2023 03:45:53 +0900 Subject: [PATCH 2/8] dlopen fix for win32 * based on Windows support PR #2456 by @andreigh * DISPATCH_ARGS fix by @andreigh * WIN32 fix using LoadLibrary --- python/triton/compiler/make_launcher.py | 25 ++++++++++++++++++ python/triton/runtime/backends/cuda.c | 35 +++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/python/triton/compiler/make_launcher.py b/python/triton/compiler/make_launcher.py index 84752c942ce6..e716e70253fe 100644 --- a/python/triton/compiler/make_launcher.py +++ b/python/triton/compiler/make_launcher.py @@ -109,7 +109,12 @@ def format_of(ty): #include \"cuda.h\" #include #include +#ifndef _WIN32 #include +#else +#define WIN32_LEAN_AND_MEAN +#include +#endif static inline void gpuAssert(CUresult code, const char *file, int line) {{ @@ -132,6 +137,7 @@ def format_of(ty): typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra); +#ifndef _WIN32 static cuLaunchKernelEx_t getLaunchKernelExHandle() {{ // Open the shared library void* handle = dlopen("libcuda.so", RTLD_LAZY); @@ -150,6 +156,25 @@ def format_of(ty): }} return cuLaunchKernelExHandle; }} +#else +static cuLaunchKernelEx_t getLaunchKernelExHandle() {{ + // Open the shared library + HMODULE handle = LoadLibraryA("nvcuda.dll"); + if (!handle) {{ + PyErr_SetString(PyExc_RuntimeError, "Failed to open nvcuda.dll"); + return NULL; + }} + 
cuLaunchKernelEx_t cuLaunchKernelExHandle = + (cuLaunchKernelEx_t)GetProcAddress((HMODULE)handle, "cuLaunchKernelEx"); + // Check for errors + long error = GetLastError(); + if (error) {{ + PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from nvcuda.dll"); + return NULL; + }} + return cuLaunchKernelExHandle; +}} +#endif static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{ void *params[] = {{ {', '.join(f"&arg{i}" for i in params)} }}; diff --git a/python/triton/runtime/backends/cuda.c b/python/triton/runtime/backends/cuda.c index 0b6fdcddbaee..4c08cdcb24c5 100644 --- a/python/triton/runtime/backends/cuda.c +++ b/python/triton/runtime/backends/cuda.c @@ -1,5 +1,10 @@ #include "cuda.h" +#ifndef _WIN32 #include +#else +#define WIN32_LEAN_AND_MEAN +#include +#endif #include #define PY_SSIZE_T_CLEAN #include @@ -94,10 +99,17 @@ static bool gpuAssert(CUresult code, const char *file, int line) { #define DISPATCH_ARGS_N(_14, _13, _12, _11, _10, _9, _8, _7, _6, _5, _4, _3, \ _2, _1, N, ...) \ ADD_ENUM_ITEM_##N +#if !defined(_MSC_VER) || defined(__clang__) #define DISPATCH_ARGS(...) \ DISPATCH_ARGS_N(__VA_ARGS__, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, \ 0) \ (__VA_ARGS__) +#else +#define EXPAND_ARGS(args) args +#define DISPATCH_ARGS(...) \ + DISPATCH_ARGS_N EXPAND_ARGS((__VA_ARGS__, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, \ + 4, 3, 2, 1, 0))(__VA_ARGS__) +#endif #define ADD_ENUM_TO_MODULE(module, enum_name, ...) 
\ do { \ @@ -377,6 +389,7 @@ typedef CUresult (*cuTensorMapEncodeTiled_t)( CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill); +#ifndef _WIN32 static cuTensorMapEncodeTiled_t getCuTensorMapEncodeTiledHandle() { // Open the shared library void *handle = dlopen("libcuda.so", RTLD_LAZY); @@ -398,6 +411,28 @@ static cuTensorMapEncodeTiled_t getCuTensorMapEncodeTiledHandle() { } return cuTensorMapEncodeTiledHandle; } +#else +static cuTensorMapEncodeTiled_t getCuTensorMapEncodeTiledHandle() { + // Open the shared library + HMODULE handle = LoadLibraryA("nvcuda.dll"); + if (!handle) { + PyErr_SetString(PyExc_RuntimeError, "Failed to open nvcuda.dll"); + return NULL; + } + cuTensorMapEncodeTiled_t cuTensorMapEncodeTiledHandle = + (cuTensorMapEncodeTiled_t)GetProcAddress((HMODULE)handle, + "cuTensorMapEncodeTiled"); + // Check for errors + long error = GetLastError(); + if (error) { + PyErr_SetString( + PyExc_RuntimeError, + "Failed to retrieve cuTensorMapEncodeTiled from nvcuda.dll"); + return NULL; + } + return cuTensorMapEncodeTiledHandle; +} +#endif static PyObject *tensorMapEncodeTiled(PyObject *self, PyObject *args) { CUtensorMap *tensorMap = (CUtensorMap *)malloc(sizeof(CUtensorMap)); From 12812fbd8cfdf2996a310861a67aa1da5fa7ade2 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Fri, 1 Dec 2023 14:53:43 +0900 Subject: [PATCH 3/8] fix compile error clang error "(aka 'long long') must match previous return type 'long' when lambda expression has unspecified explicit return typ" --- .../TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp index f6e3e364bac6..34258db114fa 100644 --- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp @@ 
-709,8 +709,8 @@ void mlir::triton::asyncLaunchDots(scf::ForOp forOp) { lastOp = op; op = op->getBlock()->getParentOp(); } - return std::distance(lastOp->getBlock()->getParent()->begin(), - lastOp->getBlock()->getIterator()); + return (long)std::distance(lastOp->getBlock()->getParent()->begin(), + lastOp->getBlock()->getIterator()); }; /// XXX(Keren): Clean up the following duplicate code with checkDotOp /// dots to be pipelined From 17b301098406d3205dbd535a2592d890dcf56343 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Mon, 4 Dec 2023 00:35:00 +0900 Subject: [PATCH 4/8] unit/runtime/*.py fix for windows --- python/test/unit/runtime/test_cache.py | 2 +- python/test/unit/runtime/test_subproc.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/test/unit/runtime/test_cache.py b/python/test/unit/runtime/test_cache.py index cd589fa920f5..339dc25e617a 100644 --- a/python/test/unit/runtime/test_cache.py +++ b/python/test/unit/runtime/test_cache.py @@ -70,7 +70,7 @@ def test_nested1_change(): def write_and_load_module(code, num_extra_lines): - with tempfile.NamedTemporaryFile(mode='w+', suffix='.py') as f: + with tempfile.NamedTemporaryFile(mode='w+', suffix='.py', delete=False) as f: f.write(('# extra line\n' * num_extra_lines) + code) f.flush() spec = importlib.util.spec_from_file_location("module.name", f.name) diff --git a/python/test/unit/runtime/test_subproc.py b/python/test/unit/runtime/test_subproc.py index 63401f28e42b..d0ecd771384f 100644 --- a/python/test/unit/runtime/test_subproc.py +++ b/python/test/unit/runtime/test_subproc.py @@ -34,11 +34,15 @@ def kernel_sub(a, b, o, N: tl.constexpr): def test_compile_in_subproc() -> None: + import os major, minor = torch.cuda.get_device_capability(0) cc = major * 10 + minor config = triton.compiler.AttrsDescriptor(tuple(range(4)), (), (), ()) - multiprocessing.set_start_method('fork') + if os.name == "nt": + multiprocessing.set_start_method('spawn') + else: + 
multiprocessing.set_start_method('fork') proc = multiprocessing.Process(target=compile_fn, args=(config, cc)) proc.start() proc.join() @@ -64,7 +68,7 @@ def test_compile_in_forked_subproc() -> None: capability = major * 10 + minor config = triton.compiler.AttrsDescriptor(tuple(range(1)), (), (), ()) - assert multiprocessing.get_start_method() == 'fork' + assert multiprocessing.get_start_method() in ['fork', 'spawn'] proc = multiprocessing.Process(target=compile_fn_dot, args=(config, capability)) proc.start() proc.join() From 360bc757857d0cae2e2b8c27d13680f8e7ffc8ee Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Mon, 4 Dec 2023 12:47:09 +0900 Subject: [PATCH 5/8] fix MANIFEST.in * fix warning "warning: manifest_maker: MANIFEST.in, line 4: path 'triton/runtime/backends/' cannot end with '/'" --- python/MANIFEST.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index c8c818919820..f46c7ce67dee 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -1,5 +1,5 @@ graft src graft triton/third_party graft triton/tools -graft triton/runtime/backends/ +graft triton/runtime/backends graft triton/language/extra From bc260e320290afcfd09521b07670d5abe2c89123 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Tue, 5 Dec 2023 01:40:04 +0900 Subject: [PATCH 6/8] update llvm-hash --- cmake/llvm-hash.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/llvm-hash.txt b/cmake/llvm-hash.txt index 74f95be57c3f..506504162ce0 100644 --- a/cmake/llvm-hash.txt +++ b/cmake/llvm-hash.txt @@ -1 +1 @@ -5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372 +668865789620f390fbad4d7093ed8ca6eb932c31 From b40c41153fad33d824138246c2ecb8d5c9501d3a Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Mon, 4 Dec 2023 23:05:23 +0900 Subject: [PATCH 7/8] use windows llvm build --- python/setup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/setup.py b/python/setup.py index 180ce68f7676..3ba6748e849d 100644 --- 
a/python/setup.py +++ b/python/setup.py @@ -86,6 +86,10 @@ def get_llvm_package_info(): vglibc = tuple(map(int, platform.libc_ver()[1].split('.'))) vglibc = vglibc[0] * 100 + vglibc[1] system_suffix = 'ubuntu-x64' if vglibc > 217 else 'centos-x64' + elif system == "Windows": + if arch == "AMD64": + arch = "x64" + system_suffix = f"windows-{arch}" else: return Package("llvm", "LLVM-C.lib", "", "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH") # use_assert_enabled_llvm = check_env_flag("TRITON_USE_ASSERT_ENABLED_LLVM", "False") From 99378a45f670603a104fc8af26aa926ed70fb5e6 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 30 Nov 2023 08:10:20 +0900 Subject: [PATCH 8/8] support windows-latest * use conda for ubuntu-latest * enable windows-latest build * disable artifact check for non self-hosted * build wheels and upload dist artifacts * update build matrix with python-version, etc. * fix deprecated ::set-output, etc. --- .github/workflows/integration-tests.yml | 166 +++++++++++++++++++++--- environment.yml | 18 +++ 2 files changed, 166 insertions(+), 18 deletions(-) create mode 100644 environment.yml diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index b13c74801ee3..cff51adffb0a 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -23,26 +23,25 @@ jobs: matrix-required: ${{ steps.set-matrix.outputs.matrix-required }} matrix-optional: ${{ steps.set-matrix.outputs.matrix-optional }} steps: - - name: Prepare runner matrix + - name: Prepare matrix id: set-matrix run: | if [ x"${{ github.repository }}" == x"openai/triton" ]; then - echo '::set-output name=matrix-required::[["self-hosted", "A100"], ["self-hosted", "H100"]]' - echo '::set-output name=matrix-optional::[["self-hosted", "gfx908"], ["self-hosted", "arc770"]]' + echo 'matrix-required={"runner": [["self-hosted", "A100"], ["self-hosted", "H100"]], "python-version": ["3.11"], "cuda-version": ["12.1"], "cc": 
["clang"]}' >> "$GITHUB_OUTPUT" + echo 'matrix-optional={"runner": [["self-hosted", "gfx908"], ["self-hosted", "arc770"]], "python-version": ["3.11"], "cuda-version": ["12.1"], "cc": ["clang"]}' >> "$GITHUB_OUTPUT" else - echo '::set-output name=matrix-required::["ubuntu-latest"]' - echo '::set-output name=matrix-optional::["ubuntu-latest"]' + echo 'matrix-required={"runner":["ubuntu-latest", "windows-latest"], "python-version": ["3.10", "3.11"], "cuda-version": ["12.1"], "cc": ["clang"]}' >> "$GITHUB_OUTPUT" + echo 'matrix-optional={"runner":["ubuntu-latest", "windows-latest"], "python-version": ["3.10", "3.11"], "cuda-version": ["12.1"], "cc": ["clang"]}' >> "$GITHUB_OUTPUT" fi Integration-Tests: needs: Runner-Preparation runs-on: ${{ matrix.runner }} - timeout-minutes: 20 + timeout-minutes: 60 strategy: - matrix: - runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-required)}} + matrix: ${{fromJson(needs.Runner-Preparation.outputs.matrix-required)}} steps: - name: Checkout @@ -56,11 +55,119 @@ jobs: echo "ENABLE_TMA=0" >> "${GITHUB_ENV}" echo "TRITON_DISABLE_LINE_INFO=1" >> "${GITHUB_ENV}" + - name: Set up Python ${{ matrix.python-version }} + if: matrix.runner[0] == 'self-hosted' + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Set up MSVC + if: matrix.runner == 'windows-latest' + uses: ilammy/msvc-dev-cmd@v1.4.1 + with: + arch: amd64 + + - name: Setup Mambaforge (Windows) + if: matrix.runner == 'windows-latest' + uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-variant: Mambaforge + miniforge-version: latest + activate-environment: triton-env + use-mamba: true + + - uses: conda-incubator/setup-miniconda@v3 + if: matrix.runner == 'windows-latest' + with: + activate-environment: triton-env + environment-file: environment.yml + auto-activate-base: true + python-version: ${{ matrix.python-version }} + + - name: set Environment Variables (Windows) + if: matrix.runner == 'windows-latest' + shell: 
bash -el {0}
+      run: |
+        LLVM_SHORTHASH="$(cat cmake/llvm-hash.txt | cut -c1-8)"
+        # prepare LLVM prebuilt path. will be downloaded and extracted by setup.py step
+        echo "~/.triton/llvm/llvm-$LLVM_SHORTHASH-windows-x64/bin" >> "$GITHUB_PATH"
+        # compile with a selected matrix.cc
+        if [ "${{matrix.cc}}" = "cl" ]; then
+          echo "CC=cl" >> "${GITHUB_ENV}"
+          echo "CXX=cl" >> "${GITHUB_ENV}"
+        elif [ "${{matrix.cc}}" = "clang" ]; then
+          echo "CC=clang" >> "${GITHUB_ENV}"
+          echo "CXX=clang++" >> "${GITHUB_ENV}"
+        fi
+
+    - name: CUDA toolkit ${{ matrix.cuda-version }}
+      shell: bash -el {0}
+      if: matrix.runner[0] != 'self-hosted'
+      run: |
+        if [ "${{ matrix.runner }}" = "ubuntu-latest" ]; then
+          # prepare space for ubuntu
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+        fi
+
+        addon=""
+        cuda_version=${{ matrix.cuda-version }}
+        [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1" && addon="cuda-cudart-static cuda-nvrtc"
+        [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0"
+
+        conda install cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version"
+
+    - name: Get Date (Windows)
+      if: matrix.runner == 'windows-latest'
+      id: get-date
+      run: echo "today=$(date -u '+%Y%m%d')" >> $GITHUB_OUTPUT
+      shell: bash
+
+    - name: Cache conda env (Windows)
+      if: matrix.runner == 'windows-latest'
+      id: cache
+      uses: actions/cache@v3
+      env:
+        # Increase this value to reset cache if environment.yml has not changed
+        CACHE_NUMBER: 0
+      with:
+        path: ${{ env.CONDA }}/envs
+        key:
+          ${{ matrix.runner }}--${{ steps.get-date.outputs.today }}--conda-${{ env.CACHE_NUMBER }}-cp${{ matrix.python-version }}-${{ hashFiles('environment.yml') }}
+
+
+    - name: Update conda environment (Windows)
+      if: ${{(matrix.runner == 'windows-latest')}}
+      shell: bash -el {0}
+      run: |
+        if [ "${{ steps.cache.outputs.cache-hit }}" != "true" ]; then
+          mamba env update -n triton-env -f environment.yml
+          cat environment.yml
+        fi
+
+    - name: Update environment
+      if: matrix.runner[0] != 'self-hosted'
+      shell: bash
+      run: |
+        echo "BACKEND=CUDA" >> "${GITHUB_ENV}"
+        echo "ENABLE_TMA=0" >> "${GITHUB_ENV}"
+        echo "TRITON_DISABLE_LINE_INFO=1" >> "${GITHUB_ENV}"
+
+
+    - name: Set reusable strings
+      # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
+      id: strings
+      shell: bash
+      run: |
+        echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
+
     - name: Clear cache
+      shell: bash
       run: |
         rm -rf ~/.triton
 
     - name: Update PATH
+      if: matrix.runner[0] == 'self-hosted'
       run: |
         echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"
 
@@ -70,17 +177,29 @@ jobs:
         python3 -m pre_commit run --all-files --verbose
 
     - name: Install Triton
-      if: ${{ env.BACKEND == 'CUDA'}}
+      if: matrix.runner != 'windows-latest'
       run: |
         cd python
         python3 -m pip install --upgrade pip
-        python3 -m pip install cmake==3.24 ninja pytest-xdist
+        python3 -m pip install cmake==3.24 ninja pytest-xdist wheel
         sudo apt-get update -y
         sudo apt-get install -y ccache clang lld
         TRITON_BUILD_WITH_CLANG_LLD=true TRITON_BUILD_WITH_CCACHE=true python3 -m pip install --no-build-isolation -vvv '.[tests]'
+        if [ "${{ matrix.runner }}" = 'ubuntu-latest' ]; then
+          python3 setup.py bdist_wheel
+        fi
+
+    - name: Install Triton (Windows)
+      if: matrix.runner == 'windows-latest'
+      run: |
+        cd python
+        python3 -m pip install --upgrade pip
+        python3 -m pip install cmake==3.24 ninja pytest-xdist wheel
+        python3 -m pip install --no-build-isolation -vvv .
+        python3 setup.py bdist_wheel
 
     - name: Run lit tests
-      if: ${{ env.BACKEND == 'CUDA'}}
+      if: matrix.runner[0] == 'self-hosted' && env.BACKEND == 'CUDA'
       run: |
         python3 -m pip install lit
         cd python
@@ -96,7 +215,7 @@ jobs:
           echo "ENABLE_TMA=1" >> "${GITHUB_ENV}"
 
     - name: Run python tests on CUDA with ENABLE_TMA=1
-      if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '1'}}
+      if: ${{(matrix.runner[0] == 'self-hosted') && env.BACKEND == 'CUDA' && env.ENABLE_TMA == '1'}}
       run: |
         cd python/test/unit
         python3 -m pytest -n 8 --ignore=runtime --ignore=operators --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
@@ -109,7 +228,7 @@ jobs:
         python3 -m pytest hopper/test_flashattention.py
 
     - name: Run python tests on CUDA with ENABLE_TMA=0
-      if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '0'}}
+      if: ${{(matrix.runner[0] == 'self-hosted') && env.BACKEND == 'CUDA' && env.ENABLE_TMA == '0'}}
       run: |
         cd python/test/unit
         python3 -m pytest -n 8 --ignore=runtime --ignore=hopper --ignore=operators --ignore=language/test_line_info.py
@@ -119,10 +238,12 @@ jobs:
         TRITON_DISABLE_LINE_INFO=0 python3 -m pytest language/test_line_info.py
 
     - name: Clear cache
+      shell: bash
       run: |
         rm -rf ~/.triton
 
     - name: Run interpreter tests
+      if: matrix.runner[0] == 'self-hosted'
       env:
         # TRITON_INTERPRET: "1"
         CUA_VISIBLE_DEVICES: ""
       run: |
         cd python/test/unit
         python3 -m pytest -vs operators/test_flash_attention.py
 
     - name: Run partial tests on CUDA with ENABLE_TMA=1
-      if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '1'}}
+      if: ${{(matrix.runner[0] == 'self-hosted') && env.BACKEND == 'CUDA' && env.ENABLE_TMA == '1'}}
       run: |
         cd python/test/unit
         python3 -m pytest -n 8 operators
 
     - name: Run partial tests on CUDA with ENABLE_TMA=0
-      if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '0'}}
+      if: ${{(matrix.runner[0] == 'self-hosted') && env.BACKEND == 'CUDA' && env.ENABLE_TMA == '0'}}
       run: |
         cd python/test/unit
         python3 -m pytest -n 8 operators
 
+    - name: Upload Build artifacts
+      if: matrix.runner[0] != 'self-hosted'
+      uses: actions/upload-artifact@v3
+      with:
+        name: triton-dist ${{ matrix.runner }}
+        path: |
+          ${{ github.workspace }}/python/dist/
+
     - name: Create artifacts archive
       if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100' || matrix.runner[1] == 'H100')}}
       run: |
@@ -150,20 +279,20 @@ jobs:
 
     - name: Upload artifacts archive
       if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100' || matrix.runner[1] == 'H100')}}
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v3
       with:
         name: artifacts ${{ matrix.runner[1] }}
         path: ~/.triton/artifacts.tar.gz
 
     - name: Run CXX unittests
-      if: ${{ env.BACKEND == 'CUDA'}}
+      if: ${{(matrix.runner[0] == 'self-hosted') && env.BACKEND == 'CUDA'}}
       run: |
         cd python
         cd "build/$(ls build | grep -i cmake)"
         ctest
 
     - name: Regression tests
-      if: ${{ contains(matrix.runner, 'A100') }}
+      if: ${{ (matrix.runner[0] == 'self-hosted') && contains(matrix.runner, 'A100') }}
       run: |
         python3 -m pip install pytest-rerunfailures
         cd python/test/regression
@@ -173,6 +302,7 @@ jobs:
         sudo nvidia-smi -i 0 -rgc
 
   Compare-artifacts:
+    if: ${{(github.repository == 'openai/triton')}}
     needs: Integration-Tests
     timeout-minutes: 5
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 000000000000..031020b365bb
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,18 @@
+name: triton
+channels:
+  - conda-forge
+  - pytorch
+dependencies:
+  - python
+  - pytest
+  - pytorch
+  - torchaudio
+  - torchvision
+  - typer
+  - ca-certificates
+  - certifi
+  - openssl
+  - zlib
+  - zstd
+  - llvm>=17.0
+  - mlir>=17.0