diff --git a/pkgs/by-name/tr/triton-llvm/package.nix b/pkgs/by-name/tr/triton-llvm/package.nix
index d45aa2fafe65f..829f0c912afd2 100644
--- a/pkgs/by-name/tr/triton-llvm/package.nix
+++ b/pkgs/by-name/tr/triton-llvm/package.nix
@@ -1,6 +1,7 @@
 { lib
 , stdenv
 , fetchFromGitHub
+, fetchpatch
 , pkgsBuildBuild
 , pkg-config
 , cmake
@@ -11,6 +12,7 @@
 , libedit
 , libffi
 , libpfm
+, lit
 , mpfr
 , zlib
 , ncurses
@@ -45,7 +47,7 @@ let
   isNative = stdenv.hostPlatform == stdenv.buildPlatform;
 in stdenv.mkDerivation (finalAttrs: {
   pname = "triton-llvm";
-  version = "17.0.0-c5dede880d17";
+  version = "19.1.0-rc1"; # One of the tags at https://github.com/llvm/llvm-project/commit/10dc3a8e916d73291269e5e2b82dd22681489aa1
 
   outputs = [
     "out"
@@ -60,9 +62,18 @@ in stdenv.mkDerivation (finalAttrs: {
   src = fetchFromGitHub {
     owner = "llvm";
     repo = "llvm-project";
-    rev = "c5dede880d175f7229c9b2923f4753e12702305d";
-    hash = "sha256-v4r3+7XVFK+Dzxt/rErZNJ9REqFO3JmGN4X4vZ+77ew=";
+    rev = "10dc3a8e916d73291269e5e2b82dd22681489aa1";
+    hash = "sha256-9DPvcFmhzw6MipQeCQnr35LktW0uxtEL8axMMPXIfWw=";
   };
+  patches = [
+    # glibc-2.40 support
+    # [llvm-exegesis] Use correct rseq struct size #100804
+    # https://github.com/llvm/llvm-project/issues/100791
+    (fetchpatch {
+      url = "https://github.com/llvm/llvm-project/commit/84837e3cc1cf17ed71580e3ea38299ed2bfaa5f6.patch";
+      hash = "sha256-QKa+kyXjjGXwTQTEpmKZx5yYjOyBX8A8NQoIYUaGcIw=";
+    })
+  ];
 
   nativeBuildInputs = [
     pkg-config
@@ -74,6 +85,7 @@ in stdenv.mkDerivation (finalAttrs: {
     doxygen
     sphinx
     python3Packages.recommonmark
+    python3Packages.myst-parser
   ];
 
   buildInputs = [
@@ -90,7 +102,9 @@ in stdenv.mkDerivation (finalAttrs: {
     ncurses
   ];
 
-  sourceRoot = "${finalAttrs.src.name}/llvm";
+  preConfigure = ''
+    cd llvm
+  '';
 
   cmakeFlags = [
     (lib.cmakeFeature "LLVM_TARGETS_TO_BUILD" (lib.concatStringsSep ";" llvmTargetsToBuild'))
@@ -140,23 +154,25 @@ in stdenv.mkDerivation (finalAttrs: {
 
   postPatch = ''
     # `CMake Error: cannot write to file "/build/source/llvm/build/lib/cmake/mlir/MLIRTargets.cmake": Permission denied`
-    chmod +w -R ../mlir
-    patchShebangs ../mlir/test/mlir-reduce
+    chmod +w -R ./mlir
+    patchShebangs ./mlir/test/mlir-reduce
 
     # FileSystem permissions tests fail with various special bits
-    rm test/tools/llvm-objcopy/ELF/mirror-permissions-unix.test
-    rm unittests/Support/Path.cpp
+    rm llvm/test/tools/llvm-objcopy/ELF/mirror-permissions-unix.test
+    rm llvm/unittests/Support/Path.cpp
 
-    substituteInPlace unittests/Support/CMakeLists.txt \
+    substituteInPlace llvm/unittests/Support/CMakeLists.txt \
       --replace "Path.cpp" ""
   '' + lib.optionalString stdenv.hostPlatform.isAarch64 ''
     # Not sure why this fails
-    rm test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s
+    rm llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s
   '';
 
-  postInstall = lib.optionalString (!isNative) ''
+  postInstall = ''
+    cp ${lib.getExe lit} $out/bin/llvm-lit
+  '' + (lib.optionalString (!isNative) ''
     cp -a NATIVE/bin/llvm-config $out/bin/llvm-config-native
-  '';
+  '');
 
   doCheck = buildTests;
 
diff --git a/pkgs/development/python-modules/torch/0001-cmake.py-propagate-cmakeFlags-from-environment.patch b/pkgs/development/python-modules/torch/0001-cmake.py-propagate-cmakeFlags-from-environment.patch
new file mode 100644
index 0000000000000..e30f6449c7bc5
--- /dev/null
+++ b/pkgs/development/python-modules/torch/0001-cmake.py-propagate-cmakeFlags-from-environment.patch
@@ -0,0 +1,29 @@
+From c5d4087519eae6f41c80bbd8ffbcc9390db44c7f Mon Sep 17 00:00:00 2001
+From: SomeoneSerge <else+aalto@someonex.net>
+Date: Thu, 10 Oct 2024 19:19:18 +0000
+Subject: [PATCH] cmake.py: propagate cmakeFlags from environment
+
+---
+ tools/setup_helpers/cmake.py | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py
+index 4b605fe5975..ea1d6a1ef46 100644
+--- a/tools/setup_helpers/cmake.py
++++ b/tools/setup_helpers/cmake.py
+@@ -332,6 +332,12 @@ class CMake:
+                         file=sys.stderr,
+                     )
+                     print(e, file=sys.stderr)
++
++        # Nixpkgs compat:
++        if "cmakeFlags" in os.environ:
++            import shlex
++            args.extend(shlex.split(os.environ["cmakeFlags"]))
++
+         # According to the CMake manual, we should pass the arguments first,
+         # and put the directory as the last element. Otherwise, these flags
+         # may not be passed correctly.
+-- 
+2.46.0
+
diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index 7b5b8e9f6726c..00a2a66072677 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -35,10 +35,8 @@
   removeReferencesTo,
 
   # Build inputs
+  darwin,
   numactl,
-  Accelerate,
-  CoreServices,
-  libobjc,
 
   # Propagated build inputs
   astunparse,
@@ -56,6 +54,17 @@
   tritonSupport ? (!stdenv.hostPlatform.isDarwin),
   triton,
 
+  # TODO: 1. callPackage needs to learn to distinguish between the task
+  #          of "asking for an attribute from the parent scope" and
+  #          the task of "exposing a formal parameter in .override".
+  # TODO: 2. We should probably abandon attributes such as `torchWithCuda` (etc.)
+  #          as they routinely end up consuming the wrong arguments
+  #          (dependencies without cuda support).
+  #          Instead we should rely on overlays and nixpkgsFun.
+  # (@SomeoneSerge)
+  _tritonEffective ? if cudaSupport then triton-cuda else triton,
+  triton-cuda,
+
   # Unit tests
   hypothesis,
   psutil,
@@ -95,6 +104,8 @@ let
     ;
   inherit (cudaPackages) cudaFlags cudnn nccl;
 
+  triton = throw "python3Packages.torch: use _tritonEffective instead of triton to avoid divergence";
+
   rocmPackages = rocmPackages_5;
 
   setBool = v: if v then "1" else "0";
@@ -240,6 +251,7 @@ buildPythonPackage rec {
       # Allow setting PYTHON_LIB_REL_PATH with an environment variable.
       # https://github.com/pytorch/pytorch/pull/128419
       ./passthrough-python-lib-rel-path.patch
+      ./0001-cmake.py-propagate-cmakeFlags-from-environment.patch
     ]
     ++ lib.optionals cudaSupport [ ./fix-cmake-cuda-toolkit.patch ]
     ++ lib.optionals (stdenv.hostPlatform.isDarwin && stdenv.hostPlatform.isx86_64) [
@@ -257,7 +269,18 @@ buildPythonPackage rec {
     ];
 
   postPatch =
-    lib.optionalString rocmSupport ''
+    ''
+      substituteInPlace cmake/public/cuda.cmake \
+        --replace-fail \
+          'message(FATAL_ERROR "Found two conflicting CUDA' \
+          'message(WARNING "Found two conflicting CUDA' \
+        --replace-warn \
+          "set(CUDAToolkit_ROOT" \
+          "# Upstream: set(CUDAToolkit_ROOT"
+      substituteInPlace third_party/gloo/cmake/Cuda.cmake \
+        --replace-warn "find_package(CUDAToolkit 7.0" "find_package(CUDAToolkit"
+    ''
+    + lib.optionalString rocmSupport ''
       # https://github.com/facebookincubator/gloo/pull/297
       substituteInPlace third_party/gloo/cmake/Hipify.cmake \
         --replace "\''${HIPIFY_COMMAND}" "python \''${HIPIFY_COMMAND}"
@@ -351,6 +374,17 @@ buildPythonPackage rec {
   # NB technical debt: building without NNPACK as workaround for missing `six`
   USE_NNPACK = 0;
 
+  cmakeFlags =
+    [
+      # (lib.cmakeBool "CMAKE_FIND_DEBUG_MODE" true)
+      (lib.cmakeFeature "CUDAToolkit_VERSION" cudaPackages.cudaVersion)
+    ]
+    ++ lib.optionals cudaSupport [
+      # Unbreaks version discovery in enable_language(CUDA) when wrapping nvcc with ccache
+      # Cf. https://gitlab.kitware.com/cmake/cmake/-/issues/26363
+      (lib.cmakeFeature "CMAKE_CUDA_COMPILER_TOOLKIT_VERSION" cudaPackages.cudaVersion)
+    ];
+
   preBuild = ''
     export MAX_JOBS=$NIX_BUILD_CORES
     ${python.pythonOnBuildForHost.interpreter} setup.py build --cmake-only
@@ -495,11 +529,11 @@ buildPythonPackage rec {
     ++ lib.optionals (cudaSupport || rocmSupport) [ effectiveMagma ]
     ++ lib.optionals stdenv.hostPlatform.isLinux [ numactl ]
     ++ lib.optionals stdenv.hostPlatform.isDarwin [
-      Accelerate
-      CoreServices
-      libobjc
+      darwin.apple_sdk.frameworks.Accelerate
+      darwin.apple_sdk.frameworks.CoreServices
+      darwin.libobjc
     ]
-    ++ lib.optionals tritonSupport [ triton ]
+    ++ lib.optionals tritonSupport [ _tritonEffective ]
     ++ lib.optionals MPISupport [ mpi ]
     ++ lib.optionals rocmSupport [ rocmtoolkit_joined ];
 
@@ -527,7 +561,7 @@ buildPythonPackage rec {
 
     # torch/csrc requires `pybind11` at runtime
     pybind11
-  ] ++ lib.optionals tritonSupport [ triton ];
+  ] ++ lib.optionals tritonSupport [ _tritonEffective ];
 
   propagatedCxxBuildInputs =
     [ ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ];
@@ -662,7 +696,9 @@ buildPythonPackage rec {
       thoughtpolice
       tscholak
     ]; # tscholak esp. for darwin-related builds
-    platforms = with lib.platforms; linux ++ lib.optionals (!cudaSupport && !rocmSupport) darwin;
+    platforms =
+      lib.platforms.linux
+      ++ lib.optionals (!cudaSupport && !rocmSupport) lib.platforms.darwin;
     broken = builtins.any trivial.id (builtins.attrValues brokenConditions);
   };
 }
diff --git a/pkgs/development/python-modules/triton/0000-dont-download-ptxas.patch b/pkgs/development/python-modules/triton/0000-dont-download-ptxas.patch
deleted file mode 100644
index d31a4798af05c..0000000000000
--- a/pkgs/development/python-modules/triton/0000-dont-download-ptxas.patch
+++ /dev/null
@@ -1,15 +0,0 @@
-diff --git a/python/setup.py b/python/setup.py
-index 18764ec13..b3bb5b60a 100644
---- a/python/setup.py
-+++ b/python/setup.py
-@@ -269,10 +269,6 @@ class CMakeBuild(build_ext):
-         subprocess.check_call(["cmake", self.base_dir] + cmake_args, cwd=cmake_dir, env=env)
-         subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=cmake_dir)
- 
--
--download_and_copy_ptxas()
--
--
- setup(
-     name="triton",
-     version="2.1.0",
diff --git a/pkgs/development/python-modules/triton/0001-_build-allow-extra-cc-flags.patch b/pkgs/development/python-modules/triton/0001-_build-allow-extra-cc-flags.patch
new file mode 100644
index 0000000000000..1e473dc59f46a
--- /dev/null
+++ b/pkgs/development/python-modules/triton/0001-_build-allow-extra-cc-flags.patch
@@ -0,0 +1,35 @@
+From 2751c5de5c61c90b56e3e392a41847f4c47258fd Mon Sep 17 00:00:00 2001
+From: SomeoneSerge <else+aalto@someonex.net>
+Date: Sun, 13 Oct 2024 14:16:48 +0000
+Subject: [PATCH 1/3] _build: allow extra cc flags
+
+---
+ python/triton/runtime/build.py | 10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+diff --git a/python/triton/runtime/build.py b/python/triton/runtime/build.py
+index d7baeb286..d334dce77 100644
+--- a/python/triton/runtime/build.py
++++ b/python/triton/runtime/build.py
+@@ -42,9 +42,17 @@ def _build(name, src, srcdir, library_dirs, include_dirs, libraries):
+     py_include_dir = sysconfig.get_paths(scheme=scheme)["include"]
+     include_dirs = include_dirs + [srcdir, py_include_dir]
+     cc_cmd = [cc, src, "-O3", "-shared", "-fPIC", "-o", so]
++
++    # Nixpkgs support branch
++    # Allows passing e.g. extra -Wl,-rpath
++    cc_cmd_extra_flags = "@ccCmdExtraFlags@"
++    if cc_cmd_extra_flags != ("@" + "ccCmdExtraFlags@"): # placeholder is split here so substituteAll leaves this side intact; true only after substitution
++        import shlex
++        cc_cmd.extend(shlex.split(cc_cmd_extra_flags))
++
+     cc_cmd += [f'-l{lib}' for lib in libraries]
+     cc_cmd += [f"-L{dir}" for dir in library_dirs]
+-    cc_cmd += [f"-I{dir}" for dir in include_dirs]
++    cc_cmd += [f"-I{dir}" for dir in include_dirs if dir is not None]
+     ret = subprocess.check_call(cc_cmd)
+     if ret == 0:
+         return so
+-- 
+2.46.0
+
diff --git a/pkgs/development/python-modules/triton/0001-ptxas-disable-version-key-for-non-cuda-targets.patch b/pkgs/development/python-modules/triton/0001-ptxas-disable-version-key-for-non-cuda-targets.patch
deleted file mode 100644
index 3941d54b8b37f..0000000000000
--- a/pkgs/development/python-modules/triton/0001-ptxas-disable-version-key-for-non-cuda-targets.patch
+++ /dev/null
@@ -1,27 +0,0 @@
-From 10f3d49aa6084d1b9b9624017cce7df106b9fb7e Mon Sep 17 00:00:00 2001
-From: Yaroslav Bolyukin <iam@lach.pw>
-Date: Tue, 6 Feb 2024 13:51:28 +0100
-Subject: [PATCH] ptxas: disable version key for non-cuda targets
-
----
- python/triton/runtime/jit.py | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/python/triton/runtime/jit.py b/python/triton/runtime/jit.py
-index d55972b4b..bd875a701 100644
---- a/python/triton/runtime/jit.py
-+++ b/python/triton/runtime/jit.py
-@@ -117,8 +117,8 @@ def version_key():
-         with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f:
-             contents += [hashlib.md5(f.read()).hexdigest()]
-     # ptxas version
--    ptxas = path_to_ptxas()[0]
--    ptxas_version = hashlib.md5(subprocess.check_output([ptxas, "--version"])).hexdigest()
-+    # ptxas = path_to_ptxas()[0]
-+    ptxas_version = "noptxas"
-     return '-'.join(TRITON_VERSION) + '-' + ptxas_version + '-' + '-'.join(contents)
- 
- 
--- 
-2.43.0
-
diff --git a/pkgs/development/python-modules/triton/0001-setup.py-introduce-TRITON_OFFLINE_BUILD.patch b/pkgs/development/python-modules/triton/0001-setup.py-introduce-TRITON_OFFLINE_BUILD.patch
new file mode 100644
index 0000000000000..5b195fd7f8821
--- /dev/null
+++ b/pkgs/development/python-modules/triton/0001-setup.py-introduce-TRITON_OFFLINE_BUILD.patch
@@ -0,0 +1,64 @@
+From 587d1f3428eca63544238802f19e0be670d03244 Mon Sep 17 00:00:00 2001
+From: SomeoneSerge <else@someonex.net>
+Date: Mon, 29 Jul 2024 14:31:11 +0000
+Subject: [PATCH] setup.py: introduce TRITON_OFFLINE_BUILD
+
+To prevent any vendoring whatsoever
+---
+ python/setup.py | 26 ++++++++++++++++++++++++--
+ 1 file changed, 24 insertions(+), 2 deletions(-)
+
+diff --git a/python/setup.py b/python/setup.py
+index 73800ec40..4e5b04de4 100644
+--- a/python/setup.py
++++ b/python/setup.py
+@@ -112,6 +112,20 @@ def get_env_with_keys(key: list):
+             return os.environ[k]
+     return ""
+ 
++def is_offline_build() -> bool:
++    """
++    Downstream projects and distributions which bootstrap their own dependencies from scratch
++    and run builds in offline sandboxes
++    may set `TRITON_OFFLINE_BUILD` in the build environment to prevent any attempts at downloading
++    pinned dependencies from the internet or at using dependencies vendored in-tree.
++
++    Dependencies must be defined using respective search paths (cf. `syspath_var_name` in `Package`).
++    Missing dependencies lead to an early abortion.
++    Dependencies' compatibility is not verified.
++
++    Note that this flag isn't tested by the CI and does not provide any guarantees.
++    """
++    return os.environ.get("TRITON_OFFLINE_BUILD", "") != ""
+ 
+ # --- third party packages -----
+ 
+@@ -220,8 +234,14 @@ def get_thirdparty_packages(packages: list):
+         if os.environ.get(p.syspath_var_name):
+             package_dir = os.environ[p.syspath_var_name]
+         version_file_path = os.path.join(package_dir, "version.txt")
+-        if p.syspath_var_name not in os.environ and\
+-           (not os.path.exists(version_file_path) or Path(version_file_path).read_text() != p.url):
++
++        input_defined = p.syspath_var_name in os.environ  # caller supplied the dependency path explicitly
++        input_exists = os.path.exists(version_file_path)
++        input_compatible = input_exists and Path(version_file_path).read_text() == p.url
++
++        if is_offline_build() and not input_defined:
++            raise RuntimeError(f"Requested an offline build but {p.syspath_var_name} is not set")
++        if not is_offline_build() and not input_defined and not input_compatible:  # same download condition as the replaced upstream check
+             with contextlib.suppress(Exception):
+                 shutil.rmtree(package_root_dir)
+             os.makedirs(package_root_dir, exist_ok=True)
+@@ -245,6 +265,8 @@ def get_thirdparty_packages(packages: list):
+ 
+ 
+ def download_and_copy(name, src_path, variable, version, url_func):
++    if is_offline_build():
++        return
+     triton_cache_path = get_triton_cache_path()
+     if variable in os.environ:
+         return
+-- 
+2.45.1
+
diff --git a/pkgs/development/python-modules/triton/0002-nvidia-amd-driver-short-circuit-before-ldconfig.patch b/pkgs/development/python-modules/triton/0002-nvidia-amd-driver-short-circuit-before-ldconfig.patch
new file mode 100644
index 0000000000000..aa65cad58ed81
--- /dev/null
+++ b/pkgs/development/python-modules/triton/0002-nvidia-amd-driver-short-circuit-before-ldconfig.patch
@@ -0,0 +1,70 @@
+From 7407cb03eec82768e333909d87b7668b633bfe86 Mon Sep 17 00:00:00 2001
+From: SomeoneSerge <else+aalto@someonex.net>
+Date: Sun, 13 Oct 2024 14:28:48 +0000
+Subject: [PATCH 2/3] {nvidia,amd}/driver: short-circuit before ldconfig
+
+---
+ python/triton/runtime/build.py       | 6 +++---
+ third_party/amd/backend/driver.py    | 7 +++++++
+ third_party/nvidia/backend/driver.py | 3 +++
+ 3 files changed, 13 insertions(+), 3 deletions(-)
+
+diff --git a/python/triton/runtime/build.py b/python/triton/runtime/build.py
+index d334dce77..a64e98da0 100644
+--- a/python/triton/runtime/build.py
++++ b/python/triton/runtime/build.py
+@@ -42,6 +42,9 @@ def _build(name, src, srcdir, library_dirs, include_dirs, libraries):
+     py_include_dir = sysconfig.get_paths(scheme=scheme)["include"]
+     include_dirs = include_dirs + [srcdir, py_include_dir]
+     cc_cmd = [cc, src, "-O3", "-shared", "-fPIC", "-o", so]
++    cc_cmd += [f'-l{lib}' for lib in libraries]
++    cc_cmd += [f"-L{dir}" for dir in library_dirs]
++    cc_cmd += [f"-I{dir}" for dir in include_dirs if dir is not None]
+ 
+     # Nixpkgs support branch
+     # Allows passing e.g. extra -Wl,-rpath
+@@ -50,9 +53,6 @@ def _build(name, src, srcdir, library_dirs, include_dirs, libraries):
+         import shlex
+         cc_cmd.extend(shlex.split(cc_cmd_extra_flags))
+ 
+-    cc_cmd += [f'-l{lib}' for lib in libraries]
+-    cc_cmd += [f"-L{dir}" for dir in library_dirs]
+-    cc_cmd += [f"-I{dir}" for dir in include_dirs if dir is not None]
+     ret = subprocess.check_call(cc_cmd)
+     if ret == 0:
+         return so
+diff --git a/third_party/amd/backend/driver.py b/third_party/amd/backend/driver.py
+index 0a8cd7bed..aab8805f6 100644
+--- a/third_party/amd/backend/driver.py
++++ b/third_party/amd/backend/driver.py
+@@ -24,6 +24,13 @@ def _get_path_to_hip_runtime_dylib():
+             return env_libhip_path
+         raise RuntimeError(f"TRITON_LIBHIP_PATH '{env_libhip_path}' does not point to a valid {lib_name}")
+ 
++    # ...on release/3.1.x:
++    #         return mmapped_path
++    #     raise RuntimeError(f"memory mapped '{mmapped_path}' in process does not point to a valid {lib_name}")
++
++    if os.path.exists(os.path.join("@libhipDir@", lib_name)):  # NOTE(review): assumes lib_name is bound at function top, as the error message above suggests
++        return os.path.join("@libhipDir@", lib_name)  # return a single library path, like the TRITON_LIBHIP_PATH branch above
++
+     paths = []
+ 
+     import site
+diff --git a/third_party/nvidia/backend/driver.py b/third_party/nvidia/backend/driver.py
+index 90f71138b..30fbadb2a 100644
+--- a/third_party/nvidia/backend/driver.py
++++ b/third_party/nvidia/backend/driver.py
+@@ -21,6 +21,9 @@ def libcuda_dirs():
+     if env_libcuda_path:
+         return [env_libcuda_path]
+ 
++    if os.path.exists("@libcudaStubsDir@"):
++        return ["@libcudaStubsDir@"]
++
+     libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
+     # each line looks like the following:
+     # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
+-- 
+2.46.0
+
diff --git a/pkgs/development/python-modules/triton/0003-nvidia-cudart-a-systempath.patch b/pkgs/development/python-modules/triton/0003-nvidia-cudart-a-systempath.patch
new file mode 100644
index 0000000000000..144d84e151fe1
--- /dev/null
+++ b/pkgs/development/python-modules/triton/0003-nvidia-cudart-a-systempath.patch
@@ -0,0 +1,46 @@
+From 6f92d54e5a544bc34bb07f2808d554a71cc0e4c3 Mon Sep 17 00:00:00 2001
+From: SomeoneSerge <else+aalto@someonex.net>
+Date: Sun, 13 Oct 2024 14:30:19 +0000
+Subject: [PATCH 3/3] nvidia: cudart a systempath
+
+---
+ third_party/nvidia/backend/driver.c  | 2 +-
+ third_party/nvidia/backend/driver.py | 5 +++--
+ 2 files changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/third_party/nvidia/backend/driver.c b/third_party/nvidia/backend/driver.c
+index 44524da27..fbdf0d156 100644
+--- a/third_party/nvidia/backend/driver.c
++++ b/third_party/nvidia/backend/driver.c
+@@ -1,4 +1,4 @@
+-#include "cuda.h"
++#include <cuda.h>
+ #include <dlfcn.h>
+ #include <stdbool.h>
+ #define PY_SSIZE_T_CLEAN
+diff --git a/third_party/nvidia/backend/driver.py b/third_party/nvidia/backend/driver.py
+index 30fbadb2a..65c0562ed 100644
+--- a/third_party/nvidia/backend/driver.py
++++ b/third_party/nvidia/backend/driver.py
+@@ -10,7 +10,8 @@ from triton.backends.compiler import GPUTarget
+ from triton.backends.driver import GPUDriver
+ 
+ dirname = os.path.dirname(os.path.realpath(__file__))
+-include_dir = [os.path.join(dirname, "include")]
++import shlex
++include_dir = [*shlex.split("@cudaToolkitIncludeDirs@"), os.path.join(dirname, "include")]
+ libdevice_dir = os.path.join(dirname, "lib")
+ libraries = ['cuda']
+ 
+@@ -149,7 +150,7 @@ def make_launcher(constants, signature, ids):
+     # generate glue code
+     params = [i for i in signature.keys() if i not in constants]
+     src = f"""
+-#include \"cuda.h\"
++#include <cuda.h>
+ #include <stdbool.h>
+ #include <Python.h>
+ #include <dlfcn.h>
+-- 
+2.46.0
+
diff --git a/pkgs/development/python-modules/triton/0004-nvidia-allow-static-ptxas-path.patch b/pkgs/development/python-modules/triton/0004-nvidia-allow-static-ptxas-path.patch
new file mode 100644
index 0000000000000..eea1834d1750b
--- /dev/null
+++ b/pkgs/development/python-modules/triton/0004-nvidia-allow-static-ptxas-path.patch
@@ -0,0 +1,26 @@
+From e503e572b6d444cd27f1cdf124aaf553aa3a8665 Mon Sep 17 00:00:00 2001
+From: SomeoneSerge <else+aalto@someonex.net>
+Date: Mon, 14 Oct 2024 00:12:05 +0000
+Subject: [PATCH 4/4] nvidia: allow static ptxas path
+
+---
+ third_party/nvidia/backend/compiler.py | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py
+index 6d7994923..6720e8f97 100644
+--- a/third_party/nvidia/backend/compiler.py
++++ b/third_party/nvidia/backend/compiler.py
+@@ -20,6 +20,9 @@ def _path_to_binary(binary: str):
+         os.path.join(os.path.dirname(__file__), "bin", binary),
+     ]
+ 
++    import shlex
++    paths.extend(shlex.split("@nixpkgsExtraBinaryPaths@"))
++
+     for bin in paths:
+         if os.path.exists(bin) and os.path.isfile(bin):
+             result = subprocess.check_output([bin, "--version"], stderr=subprocess.STDOUT)
+-- 
+2.46.0
+
diff --git a/pkgs/development/python-modules/triton/bin.nix b/pkgs/development/python-modules/triton/bin.nix
index 0189278bc0e9b..6bb67753a8bd7 100644
--- a/pkgs/development/python-modules/triton/bin.nix
+++ b/pkgs/development/python-modules/triton/bin.nix
@@ -5,11 +5,8 @@
   cudaPackages,
   buildPythonPackage,
   fetchurl,
-  isPy38,
-  isPy39,
-  isPy310,
-  isPy311,
   python,
+  pythonOlder,
   autoPatchelfHook,
   filelock,
   lit,
@@ -29,7 +26,7 @@ buildPythonPackage rec {
     in
     fetchurl srcs;
 
-  disabled = !(isPy38 || isPy39 || isPy310 || isPy311);
+  disabled = pythonOlder "3.8";
 
   pythonRemoveDeps = [
     "cmake"
diff --git a/pkgs/development/python-modules/triton/default.nix b/pkgs/development/python-modules/triton/default.nix
index 98585b850e912..6e4c66e4acea6 100644
--- a/pkgs/development/python-modules/triton/default.nix
+++ b/pkgs/development/python-modules/triton/default.nix
@@ -1,111 +1,92 @@
 {
   lib,
-  config,
   addDriverRunpath,
   buildPythonPackage,
-  fetchFromGitHub,
-  fetchpatch,
-  setuptools,
   cmake,
-  ninja,
-  pybind11,
+  config,
+  cudaPackages,
+  fetchFromGitHub,
+  filelock,
   gtest,
-  zlib,
-  ncurses,
   libxml2,
   lit,
   llvm,
-  filelock,
-  torchWithRocm,
+  ncurses,
+  ninja,
+  pybind11,
   python,
-
-  runCommand,
-
-  cudaPackages,
+  pytestCheckHook,
+  stdenv,
+  substituteAll,
+  setuptools,
+  torchWithRocm,
+  zlib,
   cudaSupport ? config.cudaSupport,
+  rocmSupport ? config.rocmSupport,
+  rocmPackages,
+  triton,
 }:
 
-let
-  ptxas = lib.getExe' cudaPackages.cuda_nvcc "ptxas"; # Make sure cudaPackages is the right version each update (See python/setup.py)
-in
-buildPythonPackage rec {
+buildPythonPackage {
   pname = "triton";
-  version = "2.1.0";
+  version = "3.0.0";
   pyproject = true;
 
   src = fetchFromGitHub {
-    owner = "openai";
-    repo = pname;
-    rev = "v${version}";
-    hash = "sha256-8UTUwLH+SriiJnpejdrzz9qIquP2zBp1/uwLdHmv0XQ=";
+    owner = "triton-lang";
+    repo = "triton";
+    # latest branch commit from https://github.com/triton-lang/triton/commits/release/3.0.x/
+    rev = "91f24d87e50cb748b121a6c24e65a01187699c22";
+    hash = "sha256-L5KqiR+TgSyKjEBlkE0yOU1pemMHFk2PhEmxLdbbxUU=";
   };
 
   patches =
     [
-      # fix overflow error
-      (fetchpatch {
-        url = "https://github.com/openai/triton/commit/52c146f66b79b6079bcd28c55312fc6ea1852519.patch";
-        hash = "sha256-098/TCQrzvrBAbQiaVGCMaF3o5Yc3yWDxzwSkzIuAtY=";
+      ./0001-setup.py-introduce-TRITON_OFFLINE_BUILD.patch
+      (substituteAll {
+        src = ./0001-_build-allow-extra-cc-flags.patch;
+        ccCmdExtraFlags = "-Wl,-rpath,${addDriverRunpath.driverLink}/lib";
       })
-
-      # Upstream startded pinning CUDA version and falling back to downloading from Conda
-      # in https://github.com/triton-lang/triton/pull/1574/files#diff-eb8b42d9346d0a5d371facf21a8bfa2d16fb49e213ae7c21f03863accebe0fcfR120-R123
-      ./0000-dont-download-ptxas.patch
+      (substituteAll (
+        {
+          src = ./0002-nvidia-amd-driver-short-circuit-before-ldconfig.patch;
+        }
+        // lib.optionalAttrs rocmSupport { libhipDir = "${lib.getLib rocmPackages.clr}/lib"; }
+        // lib.optionalAttrs cudaSupport {
+          libcudaStubsDir = "${lib.getLib cudaPackages.cuda_cudart}/lib/stubs";
+          ccCmdExtraFlags = "-Wl,-rpath,${addDriverRunpath.driverLink}/lib";
+        }
+      ))
     ]
-    ++ lib.optionals (!cudaSupport) [
-      # triton wants to get ptxas version even if ptxas is not
-      # used, resulting in ptxas not found error.
-      ./0001-ptxas-disable-version-key-for-non-cuda-targets.patch
+    ++ lib.optionals cudaSupport [
+      (substituteAll {
+        src = ./0003-nvidia-cudart-a-systempath.patch;
+        cudaToolkitIncludeDirs = "${lib.getInclude cudaPackages.cuda_cudart}/include";
+      })
+      (substituteAll {
+        src = ./0004-nvidia-allow-static-ptxas-path.patch;
+        nixpkgsExtraBinaryPaths = lib.escapeShellArgs [ (lib.getExe' cudaPackages.cuda_nvcc "ptxas") ];
+      })
     ];
 
-  postPatch =
-    let
-      quote = x: ''"${x}"'';
-      subs.ldFlags =
-        let
-          # Bash was getting weird without linting,
-          # but basically upstream contains [cc, ..., "-lcuda", ...]
-          # and we replace it with [..., "-lcuda", "-L/run/opengl-driver/lib", "-L$stubs", ...]
-          old = [ "-lcuda" ];
-          new = [
-            "-lcuda"
-            "-L${addDriverRunpath.driverLink}"
-            "-L${cudaPackages.cuda_cudart}/lib/stubs/"
-          ];
-        in
-        {
-          oldStr = lib.concatMapStringsSep ", " quote old;
-          newStr = lib.concatMapStringsSep ", " quote new;
-        };
-    in
-    ''
-      # Use our `cmakeFlags` instead and avoid downloading dependencies
-      substituteInPlace python/setup.py \
-        --replace "= get_thirdparty_packages(triton_cache_path)" "= os.environ[\"cmakeFlags\"].split()"
-
-      # Already defined in llvm, when built with -DLLVM_INSTALL_UTILS
-      substituteInPlace bin/CMakeLists.txt \
-        --replace "add_subdirectory(FileCheck)" ""
-
-      # Don't fetch googletest
-      substituteInPlace unittest/CMakeLists.txt \
-        --replace "include (\''${CMAKE_CURRENT_SOURCE_DIR}/googletest.cmake)" ""\
-        --replace "include(GoogleTest)" "find_package(GTest REQUIRED)"
-
-      cat << \EOF >> python/triton/common/build.py
-      def libcuda_dirs():
-          return [ "${addDriverRunpath.driverLink}/lib" ]
-      EOF
-    ''
-    + lib.optionalString cudaSupport ''
-      # Use our linker flags
-      substituteInPlace python/triton/common/build.py \
-        --replace '${subs.ldFlags.oldStr}' '${subs.ldFlags.newStr}'
-    '';
+  postPatch = ''
+    # Use our `cmakeFlags` instead and avoid downloading dependencies
+    # remove any downloads
+    substituteInPlace python/setup.py \
+      --replace-fail "get_json_package_info(), get_pybind11_package_info()" ""\
+      --replace-fail "get_pybind11_package_info(), get_llvm_package_info()" ""\
+      --replace-fail 'packages += ["triton/profiler"]' ""\
+      --replace-fail "curr_version != version" "False"
+
+    # Don't fetch googletest
+    substituteInPlace unittest/CMakeLists.txt \
+      --replace-fail "include (\''${CMAKE_CURRENT_SOURCE_DIR}/googletest.cmake)" ""\
+      --replace-fail "include(GoogleTest)" "find_package(GTest REQUIRED)"
+  '';
+
+  build-system = [ setuptools ];
 
   nativeBuildInputs = [
-    setuptools
-    # pytestCheckHook # Requires torch (circular dependency) and probably needs GPUs:
     cmake
     ninja
 
@@ -125,7 +106,7 @@ buildPythonPackage rec {
     zlib
   ];
 
-  propagatedBuildInputs = [
+  dependencies = [
     filelock
     # triton uses setuptools at runtime:
     # https://github.com/NixOS/nixpkgs/pull/286763/#discussion_r1480392652
@@ -139,91 +120,160 @@ buildPythonPackage rec {
   ];
 
   # Avoid GLIBCXX mismatch with other cuda-enabled python packages
-  preConfigure =
-    ''
-      # Ensure that the build process uses the requested number of cores
-      export MAX_JOBS="$NIX_BUILD_CORES"
-
-      # Upstream's setup.py tries to write cache somewhere in ~/
-      export HOME=$(mktemp -d)
-
-      # Upstream's github actions patch setup.cfg to write base-dir. May be redundant
-      echo "
-      [build_ext]
-      base-dir=$PWD" >> python/setup.cfg
-
-      # The rest (including buildPhase) is relative to ./python/
-      cd python
-    ''
-    + lib.optionalString cudaSupport ''
-      export CC=${cudaPackages.backendStdenv.cc}/bin/cc;
-      export CXX=${cudaPackages.backendStdenv.cc}/bin/c++;
-
-      # Work around download_and_copy_ptxas()
-      mkdir -p $PWD/triton/third_party/cuda/bin
-      ln -s ${ptxas} $PWD/triton/third_party/cuda/bin
-    '';
+  preConfigure = ''
+    # Ensure that the build process uses the requested number of cores
+    export MAX_JOBS="$NIX_BUILD_CORES"
 
-  # CMake is run by setup.py instead
-  dontUseCmakeConfigure = true;
+    # Upstream's setup.py tries to write cache somewhere in ~/
+    export HOME=$(mktemp -d)
+
+    # Upstream's github actions patch setup.cfg to write base-dir. May be redundant
+    echo "
+    [build_ext]
+    base-dir=$PWD" >> python/setup.cfg
 
-  # Setuptools (?) strips runpath and +x flags. Let's just restore the symlink
-  postFixup = lib.optionalString cudaSupport ''
-    rm -f $out/${python.sitePackages}/triton/third_party/cuda/bin/ptxas
-    ln -s ${ptxas} $out/${python.sitePackages}/triton/third_party/cuda/bin/ptxas
+    # The rest (including buildPhase) is relative to ./python/
+    cd python
   '';
 
-  checkInputs = [ cmake ]; # ctest
-  dontUseSetuptoolsCheck = true;
+  env =
+    {
+      TRITON_BUILD_PROTON = "OFF";
+      TRITON_OFFLINE_BUILD = true;
+    }
+    // lib.optionalAttrs cudaSupport {
+      CC = lib.getExe' cudaPackages.backendStdenv.cc "cc";
+      CXX = lib.getExe' cudaPackages.backendStdenv.cc "c++";
+
+      # TODO: Unused because of how TRITON_OFFLINE_BUILD currently works (subject to change)
+      TRITON_PTXAS_PATH = lib.getExe' cudaPackages.cuda_nvcc "ptxas"; # Make sure cudaPackages is the right version each update (See python/setup.py)
+      TRITON_CUOBJDUMP_PATH = lib.getExe' cudaPackages.cuda_cuobjdump "cuobjdump";
+      TRITON_NVDISASM_PATH = lib.getExe' cudaPackages.cuda_nvdisasm "nvdisasm";
+      TRITON_CUDACRT_PATH = lib.getInclude cudaPackages.cuda_nvcc;
+      TRITON_CUDART_PATH = lib.getInclude cudaPackages.cuda_cudart;
+      TRITON_CUPTI_PATH = cudaPackages.cuda_cupti;
+    };
+
+  pythonRemoveDeps = [
+    # Circular dependency, cf. https://github.com/triton-lang/triton/issues/1374
+    "torch"
+
+    # CLI tools without dist-info
+    "cmake"
+    "lit"
+  ];
+
+  # CMake is run by setup.py instead
+  dontUseCmakeConfigure = true;
 
+  nativeCheckInputs = [ cmake ];
   preCheck = ''
     # build/temp* refers to build_ext.build_temp (looked up in the build logs)
     (cd ./build/temp* ; ctest)
-
-    # For pytestCheckHook
-    cd test/unit
   '';
 
-  # Circular dependency on torch
-  # pythonImportsCheck = [
-  #   "triton"
-  #   "triton.language"
-  # ];
+  pythonImportsCheck = [
+    "triton"
+    "triton.language"
+  ];
+
+  passthru.gpuCheck = stdenv.mkDerivation {
+    pname = "triton-pytest";
+    inherit (triton) version src;
+
+    requiredSystemFeatures = [ "cuda" ];
+
+    nativeBuildInputs = [
+      (python.withPackages (ps: [
+        ps.scipy
+        ps.torchWithCuda
+        ps.triton-cuda
+      ]))
+    ];
+
+    dontBuild = true;
+    nativeCheckInputs = [ pytestCheckHook ];
+
+    doCheck = true;
+
+    preCheck = ''
+      cd python/test/unit
+      export HOME=$TMPDIR
+    '';
+    checkPhase = "pytestCheckPhase";
+
+    installPhase = "touch $out";
+  };
 
-  # Ultimately, torch is our test suite:
   passthru.tests = {
+    # Ultimately, torch is our test suite:
     inherit torchWithRocm;
-    # Implemented as alternative to pythonImportsCheck, in case if circular dependency on torch occurs again,
-    # and pythonImportsCheck is commented back.
-    import-triton =
-      runCommand "import-triton"
-        { nativeBuildInputs = [ (python.withPackages (ps: [ ps.triton ])) ]; }
+
+    # Test as `nix run -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda`
+    # or, using `programs.nix-required-mounts`, as `nix build -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda.gpuCheck`
+    axpy-cuda =
+      cudaPackages.writeGpuTestPython
+        {
+          libraries = ps: [
+            ps.triton
+            ps.torch-no-triton
+          ];
+        }
         ''
-          python << \EOF
+          # Adapted from the vector-add tutorial by Philippe Tillet: https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html
+
           import triton
-          import triton.language
-          EOF
-          touch "$out"
-        '';
-  };
+          import triton.language as tl
+          import torch
+          import os
 
-  pythonRemoveDeps = [
-    # Circular dependency, cf. https://github.com/openai/triton/issues/1374
-    "torch"
+          @triton.jit
+          def axpy_kernel(n, a: tl.constexpr, x_ptr, y_ptr, out, BLOCK_SIZE: tl.constexpr):
+            pid = tl.program_id(axis=0)
+            block_start = pid * BLOCK_SIZE
+            offsets = block_start + tl.arange(0, BLOCK_SIZE)
+            mask = offsets < n
+            x = tl.load(x_ptr + offsets, mask=mask)
+            y = tl.load(y_ptr + offsets, mask=mask)
+            output = a * x + y
+            tl.store(out + offsets, output, mask=mask)
 
-    # CLI tools without dist-info
-    "cmake"
-    "lit"
-  ];
+          def axpy(a, x, y):
+            output = torch.empty_like(x)
+            assert x.is_cuda and y.is_cuda and output.is_cuda
+            n_elements = output.numel()
+
+            def grid(meta):
+              return (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
+
+            axpy_kernel[grid](n_elements, a, x, y, output, BLOCK_SIZE=1024)
+            return output
+
+          if __name__ == "__main__":
+            if os.environ.get("HOME", None) == "/homeless-shelter":
+              os.environ["HOME"] = os.environ.get("TMPDIR", "/tmp")
+            if "CC" not in os.environ:
+              os.environ["CC"] = "${lib.getExe' cudaPackages.backendStdenv.cc "cc"}"
+            torch.manual_seed(0)
+            size = 12345
+            x = torch.rand(size, device='cuda')
+            y = torch.rand(size, device='cuda')
+            output_torch = 3.14 * x + y
+            output_triton = axpy(3.14, x, y)
+            assert output_torch.sub(output_triton).abs().max().item() < 1e-6
+            print("Triton axpy: OK")
+        '';
+  };
 
   meta = with lib; {
     description = "Language and compiler for writing highly efficient custom Deep-Learning primitives";
-    homepage = "https://github.com/openai/triton";
+    homepage = "https://github.com/triton-lang/triton";
     platforms = platforms.linux;
     license = licenses.mit;
     maintainers = with maintainers; [
       SomeoneSerge
       Madouura
+      derdennisop
     ];
   };
 }
diff --git a/pkgs/development/python-modules/triton/prefetch.sh b/pkgs/development/python-modules/triton/prefetch.sh
deleted file mode 100755
index f218718a5cf30..0000000000000
--- a/pkgs/development/python-modules/triton/prefetch.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env nix-shell
-#!nix-shell -i bash -p nix-prefetch-scripts
-
-set -eou pipefail
-
-version=$1
-
-linux_bucket="https://download.pytorch.org/whl"
-
-url_and_key_list=(
-  "x86_64-linux-38 $linux_bucket/triton-${version}-0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl triton-${version}-cp38-cp38-linux_x86_64.whl"
-  "x86_64-linux-39 $linux_bucket/triton-${version}-0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl triton-${version}-cp39-cp39-linux_x86_64.whl"
-  "x86_64-linux-310 $linux_bucket/triton-${version}-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl triton-${version}-cp310-cp310-linux_x86_64.whl"
-  "x86_64-linux-311 $linux_bucket/triton-${version}-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl triton-${version}-cp311-cp311-linux_x86_64.whl"
-)
-
-hashfile=binary-hashes-"$version".nix
-echo "  \"$version\" = {" >> $hashfile
-
-for url_and_key in "${url_and_key_list[@]}"; do
-  key=$(echo "$url_and_key" | cut -d' ' -f1)
-  url=$(echo "$url_and_key" | cut -d' ' -f2)
-  name=$(echo "$url_and_key" | cut -d' ' -f3)
-
-  echo "prefetching ${url}..."
-  hash=$(nix hash to-sri --type sha256 `nix-prefetch-url "$url" --name "$name"`)
-
-  cat << EOF >> $hashfile
-    $key = {
-      name = "$name";
-      url = "$url";
-      hash = "$hash";
-    };
-EOF
-
-  echo
-done
-
-echo "  };" >> $hashfile
-echo "done."
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index c026d04370f7f..b259dff2b346d 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -15717,10 +15717,10 @@ self: super: with self; {
 
   toposort = callPackage ../development/python-modules/toposort { };
 
-  torch = callPackage ../development/python-modules/torch {
-    inherit (pkgs.darwin.apple_sdk.frameworks) Accelerate CoreServices;
-    inherit (pkgs.darwin) libobjc;
-  };
+  torch = callPackage ../development/python-modules/torch { };
+
+  # torch built with tritonSupport disabled; used by triton's passthru.tests.axpy-cuda
+  torch-no-triton = self.torch.override { tritonSupport = false; };
 
   torch-audiomentations = callPackage ../development/python-modules/torch-audiomentations { };