diff --git a/pkgs/by-name/tr/triton-llvm/package.nix b/pkgs/by-name/tr/triton-llvm/package.nix index d45aa2fafe65f..829f0c912afd2 100644 --- a/pkgs/by-name/tr/triton-llvm/package.nix +++ b/pkgs/by-name/tr/triton-llvm/package.nix @@ -1,6 +1,7 @@ { lib , stdenv , fetchFromGitHub +, fetchpatch , pkgsBuildBuild , pkg-config , cmake @@ -11,6 +12,7 @@ , libedit , libffi , libpfm +, lit , mpfr , zlib , ncurses @@ -45,7 +47,7 @@ let isNative = stdenv.hostPlatform == stdenv.buildPlatform; in stdenv.mkDerivation (finalAttrs: { pname = "triton-llvm"; - version = "17.0.0-c5dede880d17"; + version = "19.1.0-rc1"; # One of the tags at https://github.com/llvm/llvm-project/commit/10dc3a8e916d73291269e5e2b82dd22681489aa1 outputs = [ "out" @@ -60,9 +62,18 @@ in stdenv.mkDerivation (finalAttrs: { src = fetchFromGitHub { owner = "llvm"; repo = "llvm-project"; - rev = "c5dede880d175f7229c9b2923f4753e12702305d"; - hash = "sha256-v4r3+7XVFK+Dzxt/rErZNJ9REqFO3JmGN4X4vZ+77ew="; + rev = "10dc3a8e916d73291269e5e2b82dd22681489aa1"; + hash = "sha256-9DPvcFmhzw6MipQeCQnr35LktW0uxtEL8axMMPXIfWw="; }; + patches = [ + # glibc-2.40 support + # [llvm-exegesis] Use correct rseq struct size #100804 + # https://github.com/llvm/llvm-project/issues/100791 + (fetchpatch { + url = "https://github.com/llvm/llvm-project/commit/84837e3cc1cf17ed71580e3ea38299ed2bfaa5f6.patch"; + hash = "sha256-QKa+kyXjjGXwTQTEpmKZx5yYjOyBX8A8NQoIYUaGcIw="; + }) + ]; nativeBuildInputs = [ pkg-config @@ -74,6 +85,7 @@ in stdenv.mkDerivation (finalAttrs: { doxygen sphinx python3Packages.recommonmark + python3Packages.myst-parser ]; buildInputs = [ @@ -90,7 +102,9 @@ in stdenv.mkDerivation (finalAttrs: { ncurses ]; - sourceRoot = "${finalAttrs.src.name}/llvm"; + preConfigure = '' + cd llvm + ''; cmakeFlags = [ (lib.cmakeFeature "LLVM_TARGETS_TO_BUILD" (lib.concatStringsSep ";" llvmTargetsToBuild')) @@ -140,23 +154,25 @@ in stdenv.mkDerivation (finalAttrs: { postPatch = '' # `CMake Error: cannot write to file 
"/build/source/llvm/build/lib/cmake/mlir/MLIRTargets.cmake": Permission denied` - chmod +w -R ../mlir - patchShebangs ../mlir/test/mlir-reduce + chmod +w -R ./mlir + patchShebangs ./mlir/test/mlir-reduce # FileSystem permissions tests fail with various special bits - rm test/tools/llvm-objcopy/ELF/mirror-permissions-unix.test - rm unittests/Support/Path.cpp + rm llvm/test/tools/llvm-objcopy/ELF/mirror-permissions-unix.test + rm llvm/unittests/Support/Path.cpp - substituteInPlace unittests/Support/CMakeLists.txt \ + substituteInPlace llvm/unittests/Support/CMakeLists.txt \ --replace "Path.cpp" "" '' + lib.optionalString stdenv.hostPlatform.isAarch64 '' # Not sure why this fails - rm test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s + rm llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s ''; - postInstall = lib.optionalString (!isNative) '' + postInstall = '' + cp ${lib.getExe lit} $out/bin/llvm-lit + '' + (lib.optionalString (!isNative) '' cp -a NATIVE/bin/llvm-config $out/bin/llvm-config-native - ''; + ''); doCheck = buildTests; diff --git a/pkgs/development/python-modules/torch/0001-cmake.py-propagate-cmakeFlags-from-environment.patch b/pkgs/development/python-modules/torch/0001-cmake.py-propagate-cmakeFlags-from-environment.patch new file mode 100644 index 0000000000000..e30f6449c7bc5 --- /dev/null +++ b/pkgs/development/python-modules/torch/0001-cmake.py-propagate-cmakeFlags-from-environment.patch @@ -0,0 +1,29 @@ +From c5d4087519eae6f41c80bbd8ffbcc9390db44c7f Mon Sep 17 00:00:00 2001 +From: SomeoneSerge +Date: Thu, 10 Oct 2024 19:19:18 +0000 +Subject: [PATCH] cmake.py: propagate cmakeFlags from environment + +--- + tools/setup_helpers/cmake.py | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py +index 4b605fe5975..ea1d6a1ef46 100644 +--- a/tools/setup_helpers/cmake.py ++++ b/tools/setup_helpers/cmake.py +@@ -332,6 +332,12 @@ class CMake: + file=sys.stderr, + ) + print(e, 
file=sys.stderr) ++ ++ # Nixpkgs compat: ++ if "cmakeFlags" in os.environ: ++ import shlex ++ args.extend(shlex.split(os.environ["cmakeFlags"])) ++ + # According to the CMake manual, we should pass the arguments first, + # and put the directory as the last element. Otherwise, these flags + # may not be passed correctly. +-- +2.46.0 + diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix index 7b5b8e9f6726c..00a2a66072677 100644 --- a/pkgs/development/python-modules/torch/default.nix +++ b/pkgs/development/python-modules/torch/default.nix @@ -35,10 +35,8 @@ removeReferencesTo, # Build inputs + darwin, numactl, - Accelerate, - CoreServices, - libobjc, # Propagated build inputs astunparse, @@ -56,6 +54,17 @@ tritonSupport ? (!stdenv.hostPlatform.isDarwin), triton, + # TODO: 1. callPackage needs to learn to distinguish between the task + # of "asking for an attribute from the parent scope" and + # the task of "exposing a formal parameter in .override". + # TODO: 2. We should probably abandon attributes such as `torchWithCuda` (etc.) + # as they routinely end up consuming the wrong arguments\ + # (dependencies without cuda support). + # Instead we should rely on overlays and nixpkgsFun. + # (@SomeoneSerge) + _tritonEffective ? if cudaSupport then triton-cuda else triton, + triton-cuda, + # Unit tests hypothesis, psutil, @@ -95,6 +104,8 @@ let ; inherit (cudaPackages) cudaFlags cudnn nccl; + triton = throw "python3Packages.torch: use _tritonEffective instead of triton to avoid divergence"; + rocmPackages = rocmPackages_5; setBool = v: if v then "1" else "0"; @@ -240,6 +251,7 @@ buildPythonPackage rec { # Allow setting PYTHON_LIB_REL_PATH with an environment variable. 
# https://github.com/pytorch/pytorch/pull/128419 ./passthrough-python-lib-rel-path.patch + ./0001-cmake.py-propagate-cmakeFlags-from-environment.patch ] ++ lib.optionals cudaSupport [ ./fix-cmake-cuda-toolkit.patch ] ++ lib.optionals (stdenv.hostPlatform.isDarwin && stdenv.hostPlatform.isx86_64) [ @@ -257,7 +269,18 @@ buildPythonPackage rec { ]; postPatch = - lib.optionalString rocmSupport '' + '' + substituteInPlace cmake/public/cuda.cmake \ + --replace-fail \ + 'message(FATAL_ERROR "Found two conflicting CUDA' \ + 'message(WARNING "Found two conflicting CUDA' \ + --replace-warn \ + "set(CUDAToolkit_ROOT" \ + "# Upstream: set(CUDAToolkit_ROOT" + substituteInPlace third_party/gloo/cmake/Cuda.cmake \ + --replace-warn "find_package(CUDAToolkit 7.0" "find_package(CUDAToolkit" + '' + + lib.optionalString rocmSupport '' # https://github.com/facebookincubator/gloo/pull/297 substituteInPlace third_party/gloo/cmake/Hipify.cmake \ --replace "\''${HIPIFY_COMMAND}" "python \''${HIPIFY_COMMAND}" @@ -351,6 +374,17 @@ buildPythonPackage rec { # NB technical debt: building without NNPACK as workaround for missing `six` USE_NNPACK = 0; + cmakeFlags = + [ + # (lib.cmakeBool "CMAKE_FIND_DEBUG_MODE" true) + (lib.cmakeFeature "CUDAToolkit_VERSION" cudaPackages.cudaVersion) + ] + ++ lib.optionals cudaSupport [ + # Unbreaks version discovery in enable_language(CUDA) when wrapping nvcc with ccache + # Cf. 
https://gitlab.kitware.com/cmake/cmake/-/issues/26363 + (lib.cmakeFeature "CMAKE_CUDA_COMPILER_TOOLKIT_VERSION" cudaPackages.cudaVersion) + ]; + preBuild = '' export MAX_JOBS=$NIX_BUILD_CORES ${python.pythonOnBuildForHost.interpreter} setup.py build --cmake-only @@ -495,11 +529,11 @@ buildPythonPackage rec { ++ lib.optionals (cudaSupport || rocmSupport) [ effectiveMagma ] ++ lib.optionals stdenv.hostPlatform.isLinux [ numactl ] ++ lib.optionals stdenv.hostPlatform.isDarwin [ - Accelerate - CoreServices - libobjc + darwin.apple_sdk.frameworks.Accelerate + darwin.apple_sdk.frameworks.CoreServices + darwin.libobjc ] - ++ lib.optionals tritonSupport [ triton ] + ++ lib.optionals tritonSupport [ _tritonEffective ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; @@ -527,7 +561,7 @@ buildPythonPackage rec { # torch/csrc requires `pybind11` at runtime pybind11 - ] ++ lib.optionals tritonSupport [ triton ]; + ] ++ lib.optionals tritonSupport [ _tritonEffective ]; propagatedCxxBuildInputs = [ ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; @@ -662,7 +696,9 @@ buildPythonPackage rec { thoughtpolice tscholak ]; # tscholak esp. 
for darwin-related builds - platforms = with lib.platforms; linux ++ lib.optionals (!cudaSupport && !rocmSupport) darwin; + platforms = + lib.platforms.linux + ++ lib.optionals (!cudaSupport && !rocmSupport) lib.platforms.darwin; broken = builtins.any trivial.id (builtins.attrValues brokenConditions); }; } diff --git a/pkgs/development/python-modules/triton/0000-dont-download-ptxas.patch b/pkgs/development/python-modules/triton/0000-dont-download-ptxas.patch deleted file mode 100644 index d31a4798af05c..0000000000000 --- a/pkgs/development/python-modules/triton/0000-dont-download-ptxas.patch +++ /dev/null @@ -1,15 +0,0 @@ -diff --git a/python/setup.py b/python/setup.py -index 18764ec13..b3bb5b60a 100644 ---- a/python/setup.py -+++ b/python/setup.py -@@ -269,10 +269,6 @@ class CMakeBuild(build_ext): - subprocess.check_call(["cmake", self.base_dir] + cmake_args, cwd=cmake_dir, env=env) - subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=cmake_dir) - -- --download_and_copy_ptxas() -- -- - setup( - name="triton", - version="2.1.0", diff --git a/pkgs/development/python-modules/triton/0001-_build-allow-extra-cc-flags.patch b/pkgs/development/python-modules/triton/0001-_build-allow-extra-cc-flags.patch new file mode 100644 index 0000000000000..1e473dc59f46a --- /dev/null +++ b/pkgs/development/python-modules/triton/0001-_build-allow-extra-cc-flags.patch @@ -0,0 +1,35 @@ +From 2751c5de5c61c90b56e3e392a41847f4c47258fd Mon Sep 17 00:00:00 2001 +From: SomeoneSerge +Date: Sun, 13 Oct 2024 14:16:48 +0000 +Subject: [PATCH 1/3] _build: allow extra cc flags + +--- + python/triton/runtime/build.py | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/python/triton/runtime/build.py b/python/triton/runtime/build.py +index d7baeb286..d334dce77 100644 +--- a/python/triton/runtime/build.py ++++ b/python/triton/runtime/build.py +@@ -42,9 +42,17 @@ def _build(name, src, srcdir, library_dirs, include_dirs, libraries): + py_include_dir = 
sysconfig.get_paths(scheme=scheme)["include"] + include_dirs = include_dirs + [srcdir, py_include_dir] + cc_cmd = [cc, src, "-O3", "-shared", "-fPIC", "-o", so] ++ ++ # Nixpkgs support branch ++ # Allows passing e.g. extra -Wl,-rpath ++ cc_cmd_extra_flags = "@ccCmdExtraFlags@" ++ if cc_cmd_extra_flags != ("@" + "ccCmdExtraFlags@"): # substituteAll hack ++ import shlex ++ cc_cmd.extend(shlex.split(cc_cmd_extra_flags)) ++ + cc_cmd += [f'-l{lib}' for lib in libraries] + cc_cmd += [f"-L{dir}" for dir in library_dirs] +- cc_cmd += [f"-I{dir}" for dir in include_dirs] ++ cc_cmd += [f"-I{dir}" for dir in include_dirs if dir is not None] + ret = subprocess.check_call(cc_cmd) + if ret == 0: + return so +-- +2.46.0 + diff --git a/pkgs/development/python-modules/triton/0001-ptxas-disable-version-key-for-non-cuda-targets.patch b/pkgs/development/python-modules/triton/0001-ptxas-disable-version-key-for-non-cuda-targets.patch deleted file mode 100644 index 3941d54b8b37f..0000000000000 --- a/pkgs/development/python-modules/triton/0001-ptxas-disable-version-key-for-non-cuda-targets.patch +++ /dev/null @@ -1,27 +0,0 @@ -From 10f3d49aa6084d1b9b9624017cce7df106b9fb7e Mon Sep 17 00:00:00 2001 -From: Yaroslav Bolyukin -Date: Tue, 6 Feb 2024 13:51:28 +0100 -Subject: [PATCH] ptxas: disable version key for non-cuda targets - ---- - python/triton/runtime/jit.py | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/python/triton/runtime/jit.py b/python/triton/runtime/jit.py -index d55972b4b..bd875a701 100644 ---- a/python/triton/runtime/jit.py -+++ b/python/triton/runtime/jit.py -@@ -117,8 +117,8 @@ def version_key(): - with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f: - contents += [hashlib.md5(f.read()).hexdigest()] - # ptxas version -- ptxas = path_to_ptxas()[0] -- ptxas_version = hashlib.md5(subprocess.check_output([ptxas, "--version"])).hexdigest() -+ # ptxas = path_to_ptxas()[0] -+ ptxas_version = "noptxas" - return '-'.join(TRITON_VERSION) + '-' 
+ ptxas_version + '-' + '-'.join(contents) - - --- -2.43.0 - diff --git a/pkgs/development/python-modules/triton/0001-setup.py-introduce-TRITON_OFFLINE_BUILD.patch b/pkgs/development/python-modules/triton/0001-setup.py-introduce-TRITON_OFFLINE_BUILD.patch new file mode 100644 index 0000000000000..5b195fd7f8821 --- /dev/null +++ b/pkgs/development/python-modules/triton/0001-setup.py-introduce-TRITON_OFFLINE_BUILD.patch @@ -0,0 +1,64 @@ +From 587d1f3428eca63544238802f19e0be670d03244 Mon Sep 17 00:00:00 2001 +From: SomeoneSerge +Date: Mon, 29 Jul 2024 14:31:11 +0000 +Subject: [PATCH] setup.py: introduce TRITON_OFFLINE_BUILD + +To prevent any vendoring whatsoever +--- + python/setup.py | 26 ++++++++++++++++++++++++-- + 1 file changed, 24 insertions(+), 2 deletions(-) + +diff --git a/python/setup.py b/python/setup.py +index 73800ec40..4e5b04de4 100644 +--- a/python/setup.py ++++ b/python/setup.py +@@ -112,6 +112,20 @@ def get_env_with_keys(key: list): + return os.environ[k] + return "" + ++def is_offline_build() -> bool: ++ """ ++ Downstream projects and distributions which bootstrap their own dependencies from scratch ++ and run builds in offline sandboxes ++ may set `TRITON_OFFLINE_BUILD` in the build environment to prevent any attempts at downloading ++ pinned dependencies from the internet or at using dependencies vendored in-tree. ++ ++ Dependencies must be defined using respective search paths (cf. `syspath_var_name` in `Package`). ++ Missing dependencies lead to an early abortion. ++ Dependencies' compatibility is not verified. ++ ++ Note that this flag isn't tested by the CI and does not provide any guarantees. 
++ """ ++ return os.environ.get("TRITON_OFFLINE_BUILD", "") != "" + + # --- third party packages ----- + +@@ -220,8 +234,14 @@ def get_thirdparty_packages(packages: list): + if os.environ.get(p.syspath_var_name): + package_dir = os.environ[p.syspath_var_name] + version_file_path = os.path.join(package_dir, "version.txt") +- if p.syspath_var_name not in os.environ and\ +- (not os.path.exists(version_file_path) or Path(version_file_path).read_text() != p.url): ++ ++ input_defined = p.syspath_var_name in os.environ ++ input_exists = os.path.exists(version_file_path) ++ input_compatible = input_exists and Path(version_file_path).read_text() == p.url ++ ++ if is_offline_build() and not input_defined: ++ raise RuntimeError(f"Requested an offline build but {p.syspath_var_name} is not set") ++ if not is_offline_build() and not input_defined and not input_compatible: + with contextlib.suppress(Exception): + shutil.rmtree(package_root_dir) + os.makedirs(package_root_dir, exist_ok=True) +@@ -245,6 +265,8 @@ def get_thirdparty_packages(packages: list): + + + def download_and_copy(name, src_path, variable, version, url_func): ++ if is_offline_build(): ++ return + triton_cache_path = get_triton_cache_path() + if variable in os.environ: + return +-- +2.45.1 + diff --git a/pkgs/development/python-modules/triton/0002-nvidia-amd-driver-short-circuit-before-ldconfig.patch b/pkgs/development/python-modules/triton/0002-nvidia-amd-driver-short-circuit-before-ldconfig.patch new file mode 100644 index 0000000000000..aa65cad58ed81 --- /dev/null +++ b/pkgs/development/python-modules/triton/0002-nvidia-amd-driver-short-circuit-before-ldconfig.patch @@ -0,0 +1,70 @@ +From 7407cb03eec82768e333909d87b7668b633bfe86 Mon Sep 17 00:00:00 2001 +From: SomeoneSerge +Date: Sun, 13 Oct 2024 14:28:48 +0000 +Subject: [PATCH 2/3] {nvidia,amd}/driver: short-circuit before ldconfig + +--- + python/triton/runtime/build.py | 6 +++--- + third_party/amd/backend/driver.py | 7 +++++++ + 
third_party/nvidia/backend/driver.py | 3 +++ + 3 files changed, 13 insertions(+), 3 deletions(-) + +diff --git a/python/triton/runtime/build.py b/python/triton/runtime/build.py +index d334dce77..a64e98da0 100644 +--- a/python/triton/runtime/build.py ++++ b/python/triton/runtime/build.py +@@ -42,6 +42,9 @@ def _build(name, src, srcdir, library_dirs, include_dirs, libraries): + py_include_dir = sysconfig.get_paths(scheme=scheme)["include"] + include_dirs = include_dirs + [srcdir, py_include_dir] + cc_cmd = [cc, src, "-O3", "-shared", "-fPIC", "-o", so] ++ cc_cmd += [f'-l{lib}' for lib in libraries] ++ cc_cmd += [f"-L{dir}" for dir in library_dirs] ++ cc_cmd += [f"-I{dir}" for dir in include_dirs if dir is not None] + + # Nixpkgs support branch + # Allows passing e.g. extra -Wl,-rpath +@@ -50,9 +53,6 @@ def _build(name, src, srcdir, library_dirs, include_dirs, libraries): + import shlex + cc_cmd.extend(shlex.split(cc_cmd_extra_flags)) + +- cc_cmd += [f'-l{lib}' for lib in libraries] +- cc_cmd += [f"-L{dir}" for dir in library_dirs] +- cc_cmd += [f"-I{dir}" for dir in include_dirs if dir is not None] + ret = subprocess.check_call(cc_cmd) + if ret == 0: + return so +diff --git a/third_party/amd/backend/driver.py b/third_party/amd/backend/driver.py +index 0a8cd7bed..aab8805f6 100644 +--- a/third_party/amd/backend/driver.py ++++ b/third_party/amd/backend/driver.py +@@ -24,6 +24,13 @@ def _get_path_to_hip_runtime_dylib(): + return env_libhip_path + raise RuntimeError(f"TRITON_LIBHIP_PATH '{env_libhip_path}' does not point to a valid {lib_name}") + ++ # ...on release/3.1.x: ++ # return mmapped_path ++ # raise RuntimeError(f"memory mapped '{mmapped_path}' in process does not point to a valid {lib_name}") ++ ++ if os.path.isfile(os.path.join("@libhipDir@", lib_name)): ++ return os.path.join("@libhipDir@", lib_name) ++ + paths = [] + + import site +diff --git a/third_party/nvidia/backend/driver.py b/third_party/nvidia/backend/driver.py +index 90f71138b..30fbadb2a 100644 +--- a/third_party/nvidia/backend/driver.py ++++ 
b/third_party/nvidia/backend/driver.py +@@ -21,6 +21,9 @@ def libcuda_dirs(): + if env_libcuda_path: + return [env_libcuda_path] + ++ if os.path.exists("@libcudaStubsDir@"): ++ return ["@libcudaStubsDir@"] ++ + libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode() + # each line looks like the following: + # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1 +-- +2.46.0 + diff --git a/pkgs/development/python-modules/triton/0003-nvidia-cudart-a-systempath.patch b/pkgs/development/python-modules/triton/0003-nvidia-cudart-a-systempath.patch new file mode 100644 index 0000000000000..144d84e151fe1 --- /dev/null +++ b/pkgs/development/python-modules/triton/0003-nvidia-cudart-a-systempath.patch @@ -0,0 +1,46 @@ +From 6f92d54e5a544bc34bb07f2808d554a71cc0e4c3 Mon Sep 17 00:00:00 2001 +From: SomeoneSerge +Date: Sun, 13 Oct 2024 14:30:19 +0000 +Subject: [PATCH 3/3] nvidia: cudart a systempath + +--- + third_party/nvidia/backend/driver.c | 2 +- + third_party/nvidia/backend/driver.py | 5 +++-- + 2 files changed, 4 insertions(+), 3 deletions(-) + +diff --git a/third_party/nvidia/backend/driver.c b/third_party/nvidia/backend/driver.c +index 44524da27..fbdf0d156 100644 +--- a/third_party/nvidia/backend/driver.c ++++ b/third_party/nvidia/backend/driver.c +@@ -1,4 +1,4 @@ +-#include "cuda.h" ++#include <cuda.h> + #include <dlfcn.h> + #include <stdbool.h> + #define PY_SSIZE_T_CLEAN +diff --git a/third_party/nvidia/backend/driver.py b/third_party/nvidia/backend/driver.py +index 30fbadb2a..65c0562ed 100644 +--- a/third_party/nvidia/backend/driver.py ++++ b/third_party/nvidia/backend/driver.py +@@ -10,7 +10,8 @@ from triton.backends.compiler import GPUTarget + from triton.backends.driver import GPUDriver + + dirname = os.path.dirname(os.path.realpath(__file__)) +-include_dir = [os.path.join(dirname, "include")] ++import shlex ++include_dir = [*shlex.split("@cudaToolkitIncludeDirs@"), os.path.join(dirname, "include")] + libdevice_dir = os.path.join(dirname, "lib") + libraries = ['cuda'] + +@@ 
-149,7 +150,7 @@ def make_launcher(constants, signature, ids): + # generate glue code + params = [i for i in signature.keys() if i not in constants] + src = f""" +-#include \"cuda.h\" ++#include <cuda.h> + #include <stdbool.h> + #include <Python.h> + #include <dlfcn.h> +-- +2.46.0 + diff --git a/pkgs/development/python-modules/triton/0004-nvidia-allow-static-ptxas-path.patch b/pkgs/development/python-modules/triton/0004-nvidia-allow-static-ptxas-path.patch new file mode 100644 index 0000000000000..eea1834d1750b --- /dev/null +++ b/pkgs/development/python-modules/triton/0004-nvidia-allow-static-ptxas-path.patch @@ -0,0 +1,26 @@ +From e503e572b6d444cd27f1cdf124aaf553aa3a8665 Mon Sep 17 00:00:00 2001 +From: SomeoneSerge +Date: Mon, 14 Oct 2024 00:12:05 +0000 +Subject: [PATCH 4/4] nvidia: allow static ptxas path + +--- + third_party/nvidia/backend/compiler.py | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py +index 6d7994923..6720e8f97 100644 +--- a/third_party/nvidia/backend/compiler.py ++++ b/third_party/nvidia/backend/compiler.py +@@ -20,6 +20,9 @@ def _path_to_binary(binary: str): + os.path.join(os.path.dirname(__file__), "bin", binary), + ] + ++ import shlex ++ paths.extend(shlex.split("@nixpkgsExtraBinaryPaths@")) ++ + for bin in paths: + if os.path.exists(bin) and os.path.isfile(bin): + result = subprocess.check_output([bin, "--version"], stderr=subprocess.STDOUT) +-- +2.46.0 + diff --git a/pkgs/development/python-modules/triton/bin.nix b/pkgs/development/python-modules/triton/bin.nix index 0189278bc0e9b..6bb67753a8bd7 100644 --- a/pkgs/development/python-modules/triton/bin.nix +++ b/pkgs/development/python-modules/triton/bin.nix @@ -5,11 +5,8 @@ cudaPackages, buildPythonPackage, fetchurl, - isPy38, - isPy39, - isPy310, - isPy311, python, + pythonOlder, autoPatchelfHook, filelock, lit, @@ -29,7 +26,7 @@ buildPythonPackage rec { in fetchurl srcs; - disabled = !(isPy38 || isPy39 || isPy310 || isPy311); + disabled = 
pythonOlder "3.8"; pythonRemoveDeps = [ "cmake" diff --git a/pkgs/development/python-modules/triton/default.nix b/pkgs/development/python-modules/triton/default.nix index 98585b850e912..6e4c66e4acea6 100644 --- a/pkgs/development/python-modules/triton/default.nix +++ b/pkgs/development/python-modules/triton/default.nix @@ -1,111 +1,92 @@ { lib, - config, addDriverRunpath, buildPythonPackage, - fetchFromGitHub, - fetchpatch, - setuptools, cmake, - ninja, - pybind11, + config, + cudaPackages, + fetchFromGitHub, + filelock, gtest, - zlib, - ncurses, libxml2, lit, llvm, - filelock, - torchWithRocm, + ncurses, + ninja, + pybind11, python, - - runCommand, - - cudaPackages, + pytestCheckHook, + stdenv, + substituteAll, + setuptools, + torchWithRocm, + zlib, cudaSupport ? config.cudaSupport, + rocmSupport ? config.rocmSupport, + rocmPackages, + triton, }: -let - ptxas = lib.getExe' cudaPackages.cuda_nvcc "ptxas"; # Make sure cudaPackages is the right version each update (See python/setup.py) -in -buildPythonPackage rec { +buildPythonPackage { pname = "triton"; - version = "2.1.0"; + version = "3.0.0"; pyproject = true; src = fetchFromGitHub { - owner = "openai"; - repo = pname; - rev = "v${version}"; - hash = "sha256-8UTUwLH+SriiJnpejdrzz9qIquP2zBp1/uwLdHmv0XQ="; + owner = "triton-lang"; + repo = "triton"; + # latest branch commit from https://github.com/triton-lang/triton/commits/release/3.0.x/ + rev = "91f24d87e50cb748b121a6c24e65a01187699c22"; + hash = "sha256-L5KqiR+TgSyKjEBlkE0yOU1pemMHFk2PhEmxLdbbxUU="; }; patches = [ - # fix overflow error - (fetchpatch { - url = "https://github.com/openai/triton/commit/52c146f66b79b6079bcd28c55312fc6ea1852519.patch"; - hash = "sha256-098/TCQrzvrBAbQiaVGCMaF3o5Yc3yWDxzwSkzIuAtY="; + ./0001-setup.py-introduce-TRITON_OFFLINE_BUILD.patch + (substituteAll { + src = ./0001-_build-allow-extra-cc-flags.patch; + ccCmdExtraFlags = "-Wl,-rpath,${addDriverRunpath.driverLink}/lib"; }) - - # Upstream startded pinning CUDA version and falling 
back to downloading from Conda - # in https://github.com/triton-lang/triton/pull/1574/files#diff-eb8b42d9346d0a5d371facf21a8bfa2d16fb49e213ae7c21f03863accebe0fcfR120-R123 - ./0000-dont-download-ptxas.patch + (substituteAll ( + { + src = ./0002-nvidia-amd-driver-short-circuit-before-ldconfig.patch; + } + // lib.optionalAttrs rocmSupport { libhipDir = "${lib.getLib rocmPackages.clr}/lib"; } + // lib.optionalAttrs cudaSupport { + libcudaStubsDir = "${lib.getLib cudaPackages.cuda_cudart}/lib/stubs"; + ccCmdExtraFlags = "-Wl,-rpath,${addDriverRunpath.driverLink}/lib"; + } + )) ] - ++ lib.optionals (!cudaSupport) [ - # triton wants to get ptxas version even if ptxas is not - # used, resulting in ptxas not found error. - ./0001-ptxas-disable-version-key-for-non-cuda-targets.patch + ++ lib.optionals cudaSupport [ + (substituteAll { + src = ./0003-nvidia-cudart-a-systempath.patch; + cudaToolkitIncludeDirs = "${lib.getInclude cudaPackages.cuda_cudart}/include"; + }) + (substituteAll { + src = ./0004-nvidia-allow-static-ptxas-path.patch; + nixpkgsExtraBinaryPaths = lib.escapeShellArgs [ (lib.getExe' cudaPackages.cuda_nvcc "ptxas") ]; + }) ]; - postPatch = - let - quote = x: ''"${x}"''; - subs.ldFlags = - let - # Bash was getting weird without linting, - # but basically upstream contains [cc, ..., "-lcuda", ...] - # and we replace it with [..., "-lcuda", "-L/run/opengl-driver/lib", "-L$stubs", ...] 
- old = [ "-lcuda" ]; - new = [ - "-lcuda" - "-L${addDriverRunpath.driverLink}" - "-L${cudaPackages.cuda_cudart}/lib/stubs/" - ]; - in - { - oldStr = lib.concatMapStringsSep ", " quote old; - newStr = lib.concatMapStringsSep ", " quote new; - }; - in - '' - # Use our `cmakeFlags` instead and avoid downloading dependencies - substituteInPlace python/setup.py \ - --replace "= get_thirdparty_packages(triton_cache_path)" "= os.environ[\"cmakeFlags\"].split()" - - # Already defined in llvm, when built with -DLLVM_INSTALL_UTILS - substituteInPlace bin/CMakeLists.txt \ - --replace "add_subdirectory(FileCheck)" "" - - # Don't fetch googletest - substituteInPlace unittest/CMakeLists.txt \ - --replace "include (\''${CMAKE_CURRENT_SOURCE_DIR}/googletest.cmake)" ""\ - --replace "include(GoogleTest)" "find_package(GTest REQUIRED)" - - cat << \EOF >> python/triton/common/build.py - def libcuda_dirs(): - return [ "${addDriverRunpath.driverLink}/lib" ] - EOF - '' - + lib.optionalString cudaSupport '' - # Use our linker flags - substituteInPlace python/triton/common/build.py \ - --replace '${subs.ldFlags.oldStr}' '${subs.ldFlags.newStr}' - ''; + postPatch = '' + # Use our `cmakeFlags` instead and avoid downloading dependencies + # remove any downloads + substituteInPlace python/setup.py \ + --replace-fail "get_json_package_info(), get_pybind11_package_info()" ""\ + --replace-fail "get_pybind11_package_info(), get_llvm_package_info()" ""\ + --replace-fail 'packages += ["triton/profiler"]' ""\ + --replace-fail "curr_version != version" "False" + + # Don't fetch googletest + substituteInPlace unittest/CMakeLists.txt \ + --replace-fail "include (\''${CMAKE_CURRENT_SOURCE_DIR}/googletest.cmake)" ""\ + --replace-fail "include(GoogleTest)" "find_package(GTest REQUIRED)" + ''; + + build-system = [ setuptools ]; nativeBuildInputs = [ - setuptools - # pytestCheckHook # Requires torch (circular dependency) and probably needs GPUs: cmake ninja @@ -125,7 +106,7 @@ buildPythonPackage rec { zlib 
]; - propagatedBuildInputs = [ + dependencies = [ filelock # triton uses setuptools at runtime: # https://github.com/NixOS/nixpkgs/pull/286763/#discussion_r1480392652 @@ -139,91 +120,160 @@ buildPythonPackage rec { ]; # Avoid GLIBCXX mismatch with other cuda-enabled python packages - preConfigure = - '' - # Ensure that the build process uses the requested number of cores - export MAX_JOBS="$NIX_BUILD_CORES" - - # Upstream's setup.py tries to write cache somewhere in ~/ - export HOME=$(mktemp -d) - - # Upstream's github actions patch setup.cfg to write base-dir. May be redundant - echo " - [build_ext] - base-dir=$PWD" >> python/setup.cfg - - # The rest (including buildPhase) is relative to ./python/ - cd python - '' - + lib.optionalString cudaSupport '' - export CC=${cudaPackages.backendStdenv.cc}/bin/cc; - export CXX=${cudaPackages.backendStdenv.cc}/bin/c++; - - # Work around download_and_copy_ptxas() - mkdir -p $PWD/triton/third_party/cuda/bin - ln -s ${ptxas} $PWD/triton/third_party/cuda/bin - ''; + preConfigure = '' + # Ensure that the build process uses the requested number of cores + export MAX_JOBS="$NIX_BUILD_CORES" - # CMake is run by setup.py instead - dontUseCmakeConfigure = true; + # Upstream's setup.py tries to write cache somewhere in ~/ + export HOME=$(mktemp -d) + + # Upstream's github actions patch setup.cfg to write base-dir. May be redundant + echo " + [build_ext] + base-dir=$PWD" >> python/setup.cfg - # Setuptools (?) strips runpath and +x flags. 
Let's just restore the symlink - postFixup = lib.optionalString cudaSupport '' - rm -f $out/${python.sitePackages}/triton/third_party/cuda/bin/ptxas - ln -s ${ptxas} $out/${python.sitePackages}/triton/third_party/cuda/bin/ptxas + # The rest (including buildPhase) is relative to ./python/ + cd python ''; - checkInputs = [ cmake ]; # ctest - dontUseSetuptoolsCheck = true; + env = + { + TRITON_BUILD_PROTON = "OFF"; + TRITON_OFFLINE_BUILD = true; + } + // lib.optionalAttrs cudaSupport { + CC = lib.getExe' cudaPackages.backendStdenv.cc "cc"; + CXX = lib.getExe' cudaPackages.backendStdenv.cc "c++"; + + # TODO: Unused because of how TRITON_OFFLINE_BUILD currently works (subject to change) + TRITON_PTXAS_PATH = lib.getExe' cudaPackages.cuda_nvcc "ptxas"; # Make sure cudaPackages is the right version each update (See python/setup.py) + TRITON_CUOBJDUMP_PATH = lib.getExe' cudaPackages.cuda_cuobjdump "cuobjdump"; + TRITON_NVDISASM_PATH = lib.getExe' cudaPackages.cuda_nvdisasm "nvdisasm"; + TRITON_CUDACRT_PATH = lib.getInclude cudaPackages.cuda_nvcc; + TRITON_CUDART_PATH = lib.getInclude cudaPackages.cuda_cudart; + TRITON_CUPTI_PATH = cudaPackages.cuda_cupti; + }; + + pythonRemoveDeps = [ + # Circular dependency, cf. 
https://github.com/triton-lang/triton/issues/1374 + "torch" + + # CLI tools without dist-info + "cmake" + "lit" + ]; + + # CMake is run by setup.py instead + dontUseCmakeConfigure = true; + nativeCheckInputs = [ cmake ]; preCheck = '' # build/temp* refers to build_ext.build_temp (looked up in the build logs) (cd ./build/temp* ; ctest) - - # For pytestCheckHook - cd test/unit ''; - # Circular dependency on torch - # pythonImportsCheck = [ - # "triton" - # "triton.language" - # ]; + pythonImportsCheck = [ + "triton" + "triton.language" + ]; + + passthru.gpuCheck = stdenv.mkDerivation { + pname = "triton-pytest"; + inherit (triton) version src; + + requiredSystemFeatures = [ "cuda" ]; + + nativeBuildInputs = [ + (python.withPackages (ps: [ + ps.scipy + ps.torchWithCuda + ps.triton-cuda + ])) + ]; + + dontBuild = true; + nativeCheckInputs = [ pytestCheckHook ]; + + doCheck = true; + + preCheck = '' + cd python/test/unit + export HOME=$TMPDIR + ''; + checkPhase = "pytestCheckPhase"; + + installPhase = "touch $out"; + }; - # Ultimately, torch is our test suite: passthru.tests = { + # Ultimately, torch is our test suite: inherit torchWithRocm; - # Implemented as alternative to pythonImportsCheck, in case if circular dependency on torch occurs again, - # and pythonImportsCheck is commented back. 
- import-triton = - runCommand "import-triton" - { nativeBuildInputs = [ (python.withPackages (ps: [ ps.triton ])) ]; } + + # Test as `nix run -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda` + # or, using `programs.nix-required-mounts`, as `nix build -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda.gpuCheck` + axpy-cuda = + cudaPackages.writeGpuTestPython + { + libraries = ps: [ + ps.triton + ps.torch-no-triton + ]; + } '' - python << \EOF + # Adopted from Philippe Tillet https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html + import triton - import triton.language - EOF - touch "$out" - ''; - }; + import triton.language as tl + import torch + import os - pythonRemoveDeps = [ - # Circular dependency, cf. https://github.com/openai/triton/issues/1374 - "torch" + @triton.jit + def axpy_kernel(n, a: tl.constexpr, x_ptr, y_ptr, out, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n + x = tl.load(x_ptr + offsets, mask=mask) + y = tl.load(y_ptr + offsets, mask=mask) + output = a * x + y + tl.store(out + offsets, output, mask=mask) - # CLI tools without dist-info - "cmake" - "lit" - ]; + def axpy(a, x, y): + output = torch.empty_like(x) + assert x.is_cuda and y.is_cuda and output.is_cuda + n_elements = output.numel() + + def grid(meta): + return (triton.cdiv(n_elements, meta['BLOCK_SIZE']), ) + + axpy_kernel[grid](n_elements, a, x, y, output, BLOCK_SIZE=1024) + return output + + if __name__ == "__main__": + if os.environ.get("HOME", None) == "/homeless-shelter": + os.environ["HOME"] = os.environ.get("TMPDIR", "/tmp") + if "CC" not in os.environ: + os.environ["CC"] = "${lib.getExe' cudaPackages.backendStdenv.cc "cc"}" + torch.manual_seed(0) + size = 12345 + x = torch.rand(size, device='cuda') + y = torch.rand(size, device='cuda') + output_torch = 3.14 * x + y + output_triton = axpy(3.14, x, y) + assert 
output_torch.sub(output_triton).abs().max().item() < 1e-6 + print("Triton axpy: OK") + ''; + }; meta = with lib; { description = "Language and compiler for writing highly efficient custom Deep-Learning primitives"; - homepage = "https://github.com/openai/triton"; + homepage = "https://github.com/triton-lang/triton"; platforms = platforms.linux; license = licenses.mit; maintainers = with maintainers; [ SomeoneSerge Madouura + derdennisop ]; }; } diff --git a/pkgs/development/python-modules/triton/prefetch.sh b/pkgs/development/python-modules/triton/prefetch.sh deleted file mode 100755 index f218718a5cf30..0000000000000 --- a/pkgs/development/python-modules/triton/prefetch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env nix-shell -#!nix-shell -i bash -p nix-prefetch-scripts - -set -eou pipefail - -version=$1 - -linux_bucket="https://download.pytorch.org/whl" - -url_and_key_list=( - "x86_64-linux-38 $linux_bucket/triton-${version}-0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl triton-${version}-cp38-cp38-linux_x86_64.whl" - "x86_64-linux-39 $linux_bucket/triton-${version}-0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl triton-${version}-cp39-cp39-linux_x86_64.whl" - "x86_64-linux-310 $linux_bucket/triton-${version}-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl triton-${version}-cp310-cp310-linux_x86_64.whl" - "x86_64-linux-311 $linux_bucket/triton-${version}-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl triton-${version}-cp311-cp311-linux_x86_64.whl" -) - -hashfile=binary-hashes-"$version".nix -echo " \"$version\" = {" >> $hashfile - -for url_and_key in "${url_and_key_list[@]}"; do - key=$(echo "$url_and_key" | cut -d' ' -f1) - url=$(echo "$url_and_key" | cut -d' ' -f2) - name=$(echo "$url_and_key" | cut -d' ' -f3) - - echo "prefetching ${url}..." 
- hash=$(nix hash to-sri --type sha256 `nix-prefetch-url "$url" --name "$name"`) - - cat << EOF >> $hashfile - $key = { - name = "$name"; - url = "$url"; - hash = "$hash"; - }; -EOF - - echo -done - -echo " };" >> $hashfile -echo "done." diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix index c026d04370f7f..b259dff2b346d 100644 --- a/pkgs/top-level/python-packages.nix +++ b/pkgs/top-level/python-packages.nix @@ -15717,10 +15717,10 @@ self: super: with self; { toposort = callPackage ../development/python-modules/toposort { }; - torch = callPackage ../development/python-modules/torch { - inherit (pkgs.darwin.apple_sdk.frameworks) Accelerate CoreServices; - inherit (pkgs.darwin) libobjc; - }; + torch = callPackage ../development/python-modules/torch { }; + + # Required to test triton + torch-no-triton = self.torch.override { tritonSupport = false; }; torch-audiomentations = callPackage ../development/python-modules/torch-audiomentations { };