From 987e5a27497173b8d6fa5a11efa7fa4b57b849c3 Mon Sep 17 00:00:00 2001
From: Luna Nova <git@lunnova.dev>
Date: Sat, 11 Oct 2025 15:31:57 -0700
Subject: [PATCH 1/4] rocmPackages.hipblaslt: add zstd to nativeBuildInputs

without this zstd compression of msgpack .dats silently failed
---
 pkgs/development/rocm-modules/6/hipblaslt/default.nix | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkgs/development/rocm-modules/6/hipblaslt/default.nix b/pkgs/development/rocm-modules/6/hipblaslt/default.nix
index 8e68e0f9e6235..61328d5cd8671 100644
--- a/pkgs/development/rocm-modules/6/hipblaslt/default.nix
+++ b/pkgs/development/rocm-modules/6/hipblaslt/default.nix
@@ -136,6 +136,7 @@ stdenv.mkDerivation (finalAttrs: {
     pkg-config
     ninja
     rocm-smi
+    zstd
   ];
 
   buildInputs = [

From 7baf5a984d40618abc1a53de6c678d2ba8cd7bbe Mon Sep 17 00:00:00 2001
From: Luna Nova <git@lunnova.dev>
Date: Sat, 11 Oct 2025 15:32:42 -0700
Subject: [PATCH 2/4] rocmPackages.hipblaslt: verify .dat files aren't huge in
 preInstallCheck

let's avoid regressing compression in future, oops
---
 .../rocm-modules/6/hipblaslt/default.nix           | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/pkgs/development/rocm-modules/6/hipblaslt/default.nix b/pkgs/development/rocm-modules/6/hipblaslt/default.nix
index 61328d5cd8671..e8061a50b92f7 100644
--- a/pkgs/development/rocm-modules/6/hipblaslt/default.nix
+++ b/pkgs/development/rocm-modules/6/hipblaslt/default.nix
@@ -125,7 +125,7 @@ stdenv.mkDerivation (finalAttrs: {
   '';
 
   doCheck = false;
-  doInstallCheck = false;
+  doInstallCheck = true;
 
   nativeBuildInputs = [
     cmake
@@ -213,6 +213,18 @@ stdenv.mkDerivation (finalAttrs: {
       rmdir $out/bin
     '';
 
+  installCheckPhase =
+    # Verify compression worked and .dat files aren't huge
+    ''
+      runHook preInstallCheck
+      find "$out" -type f -name "*.dat" -size "+2M" -exec sh -c '
+          echo "ERROR: oversized .dat file, check for issues with install compression: {}" >&2
+          exit 1
+      ' {} \;
+      echo "Verified .dat files in $out are not huge"
+      runHook postInstallCheck
+    '';
+
   # If this is false there are no kernels in the output lib
   # supporting the target device
   # so if it's an optional dep it's best to not depend on it

From 4cea628d489cdaef3f43cae9048338f6594715ed Mon Sep 17 00:00:00 2001
From: Luna Nova <git@lunnova.dev>
Date: Sat, 11 Oct 2025 09:44:19 -0700
Subject: [PATCH 3/4] rocmPackages.hipblaslt: apply PR to massively reduce peak
 disk space usage

Peak build dir usage is now 25GB

Partially applies [hipblaslt] Refactor Parallel.py to drop joblib, decimate resource usage
---
 .../6/hipblaslt/Tensile-interning.patch       | 181 ++++
 .../TensileCreateLibrary-refactor.patch       | 868 ++++++++++++++++++
 .../rocm-modules/6/hipblaslt/default.nix      |   9 +-
 3 files changed, 1056 insertions(+), 2 deletions(-)
 create mode 100644 pkgs/development/rocm-modules/6/hipblaslt/Tensile-interning.patch
 create mode 100644 pkgs/development/rocm-modules/6/hipblaslt/TensileCreateLibrary-refactor.patch

diff --git a/pkgs/development/rocm-modules/6/hipblaslt/Tensile-interning.patch b/pkgs/development/rocm-modules/6/hipblaslt/Tensile-interning.patch
new file mode 100644
index 0000000000000..acf44437520db
--- /dev/null
+++ b/pkgs/development/rocm-modules/6/hipblaslt/Tensile-interning.patch
@@ -0,0 +1,181 @@
+diff --git a/tensilelite/Tensile/Common/Utilities.py b/tensilelite/Tensile/Common/Utilities.py
+index 0a9d9db5b3..cb9779eaac 100644
+--- a/tensilelite/Tensile/Common/Utilities.py
++++ b/tensilelite/Tensile/Common/Utilities.py
+@@ -24,6 +24,7 @@
+ 
+ import functools
+ import math
++import operator
+ import os
+ import sys
+ import time
+@@ -269,8 +270,20 @@ def state(obj):
+ 
+ 
+ def state_key_ordering(cls):
+-    def tup(obj):
+-        return tuple([getattr(obj, k) for k in cls.StateKeys])
++    # Use operator.attrgetter for efficiency if __slots__ is defined
++    if hasattr(cls, '__slots__'):
++        # attrgetter is faster for slotted classes
++        getter = operator.attrgetter(*cls.StateKeys)
++        if len(cls.StateKeys) == 1:
++            # attrgetter returns scalar for single key, we need tuple
++            def tup(obj):
++                return (getter(obj),)
++        else:
++            tup = getter
++    else:
++        # Fallback for regular classes
++        def tup(obj):
++            return tuple([getattr(obj, k) for k in cls.StateKeys])
+ 
+     def lt(a, b):
+         return tup(a) < tup(b)
+diff --git a/tensilelite/Tensile/Contractions.py b/tensilelite/Tensile/Contractions.py
+index c0d4e851b1..3f2c2e98c6 100644
+--- a/tensilelite/Tensile/Contractions.py
++++ b/tensilelite/Tensile/Contractions.py
+@@ -37,9 +37,60 @@ from Tensile.Toolchain.Component import Assembler
+ from math import ceil
+ 
+ MIN_K_FOR_GSU = 32
++
++# Interning helpers to reduce memory usage by reusing identical objects
++_free_index_cache = {}
++def intern_free_index(isA, i=None, c=None, d=None, a=None, b=None):
++    key = (isA, i, c, d, a, b)
++    if key not in _free_index_cache:
++        obj = FreeIndex(isA, i, c, d)
++        obj.a = a
++        if b is not None:
++            obj.b = b
++        _free_index_cache[key] = obj
++    return _free_index_cache[key]
++
++_batch_index_cache = {}
++def intern_batch_index(a=None, b=None, c=None, d=None):
++    key = (a, b, c, d)
++    if key not in _batch_index_cache:
++        obj = BatchIndex(c=c, d=d)
++        obj.a = a
++        obj.b = b
++        _batch_index_cache[key] = obj
++    return _batch_index_cache[key]
++
++_bound_index_cache = {}
++def intern_bound_index(a=None, b=None, aMirror=False, bMirror=False):
++    key = (a, b, aMirror, bMirror)
++    if key not in _bound_index_cache:
++        obj = BoundIndex(aMirror=aMirror, bMirror=bMirror)
++        obj.a = a
++        obj.b = b
++        _bound_index_cache[key] = obj
++    return _bound_index_cache[key]
++
++_size_mapping_cache = {}
++def intern_size_mapping(size_mapping):
++    """Intern a SizeMapping instance to reduce redundancy."""
++    # Build hashable key from StateKeys, converting lists to tuples
++    key_parts = []
++    for attr in size_mapping.StateKeys:
++        val = getattr(size_mapping, attr)
++        # Convert lists to tuples for hashing
++        if isinstance(val, list):
++            val = tuple(val)
++        key_parts.append(val)
++    key = tuple(key_parts)
++
++    if key not in _size_mapping_cache:
++        _size_mapping_cache[key] = size_mapping
++    return _size_mapping_cache[key]
++
+ @state_key_ordering
+ class FreeIndex:
+     StateKeys = ['isA', 'i', 'c', 'd']
++    __slots__ = ['isA', 'i', 'c', 'd', 'a', 'b']
+ 
+     def __init__(self, isA, i=None, c=None, d=None):
+         self.isA = isA
+@@ -50,6 +101,7 @@ class FreeIndex:
+ @state_key_ordering
+ class BatchIndex:
+     StateKeys = ['a', 'b', 'c', 'd']
++    __slots__ = ['a', 'b', 'c', 'd']
+     def __init__(self, a=None, b=None, c=None, d=None):
+         self.a = a
+         self.b = b
+@@ -59,6 +111,7 @@ class BatchIndex:
+ @state_key_ordering
+ class BoundIndex:
+     StateKeys = ['a', 'b', 'aMirror', 'bMirror']
++    __slots__ = ['a', 'b', 'aMirror', 'bMirror']
+     def __init__(self, a=None, b=None, aMirror=False, bMirror=False):
+         self.a = a
+         self.b = b
+@@ -107,6 +160,23 @@ class ProblemType:
+         for ib, ic in enumerate(d['IndexAssignmentsB']):
+             indices[ic].b = ib
+ 
++        # Now intern all indices with their final state (including .a and .b)
++        for i, idx in enumerate(indices):
++            if isinstance(idx, FreeIndex):
++                indices[i] = intern_free_index(idx.isA, idx.i, idx.c, idx.d,
++                                              getattr(idx, 'a', None), getattr(idx, 'b', None))
++            elif isinstance(idx, BatchIndex):
++                indices[i] = intern_batch_index(getattr(idx, 'a', None), getattr(idx, 'b', None),
++                                               idx.c, idx.d)
++            elif isinstance(idx, BoundIndex):
++                indices[i] = intern_bound_index(getattr(idx, 'a', None), getattr(idx, 'b', None),
++                                               idx.aMirror, idx.bMirror)
++
++        # Update the lists with interned versions
++        freeIndices = [idx for idx in indices if isinstance(idx, FreeIndex)]
++        batchIndices = [idx for idx in indices if isinstance(idx, BatchIndex)]
++        boundIndices = [idx for idx in indices if isinstance(idx, BoundIndex)]
++
+         for idx in indices:
+             assert idx is not None
+             idxState = state(idx)
+@@ -596,6 +666,7 @@ class SizeMapping:
+                  'nonTemporalA',
+                  'nonTemporalB',
+                  ]
++    __slots__ = StateKeys
+ 
+     @classmethod
+     def FromOriginalState(cls, d):
+@@ -751,7 +822,7 @@ class Solution:
+         info = cls.ReadOriginalInfo(d)
+         rv.libraryLogicIndex = int(info.get("SolutionIndex", -1))
+ 
+-        rv.sizeMapping = SizeMapping.FromOriginalState(d)
++        rv.sizeMapping = intern_size_mapping(SizeMapping.FromOriginalState(d))
+ 
+         rv.internalArgsSupport = InternalArgsSupport.FromOriginalState(d)
+ 
+diff --git a/tensilelite/Tensile/TensileCreateLibrary/Run.py b/tensilelite/Tensile/TensileCreateLibrary/Run.py
+index 730b6b1fff..b0068563a0 100644
+--- a/tensilelite/Tensile/TensileCreateLibrary/Run.py
++++ b/tensilelite/Tensile/TensileCreateLibrary/Run.py
+@@ -104,7 +104,6 @@ class KernelCodeGenResult(NamedTuple):
+     src: str
+     header: Optional[str]
+     name: str
+-    targetObjFilename: str
+     isa: IsaVersion
+     wavefrontSize: int
+     cuoccupancy: int
+@@ -127,10 +126,9 @@ def processKernelSource(kernelWriterAssembly, data, splitGSU, kernel) -> KernelC
+     asmFilename = getKernelFileBase(splitGSU, kernel)
+     err, src = kernelWriter.getSourceFileString(kernel)
+     header = kernelWriter.getHeaderFileString(kernel)
+-    objFilename = kernel._state.get("codeObjectFile", None)
+     pgr = int(kernel["PrefetchGlobalRead"])
+     return KernelCodeGenResult(
+-        err, src, header, asmFilename, objFilename, tuple(kernel["ISA"]), \
++        err, src, header, asmFilename, tuple(kernel["ISA"]), \
+         kernel["WavefrontSize"], kernel["CUOccupancy"], \
+         pgr, kernel["MathClocksUnrolledLoop"]
+     )
diff --git a/pkgs/development/rocm-modules/6/hipblaslt/TensileCreateLibrary-refactor.patch b/pkgs/development/rocm-modules/6/hipblaslt/TensileCreateLibrary-refactor.patch
new file mode 100644
index 0000000000000..eb50942a095dc
--- /dev/null
+++ b/pkgs/development/rocm-modules/6/hipblaslt/TensileCreateLibrary-refactor.patch
@@ -0,0 +1,868 @@
+diff --git a/tensilelite/Tensile/SolutionStructs/Naming.py b/tensilelite/Tensile/SolutionStructs/Naming.py
+index 4f220960db1d..99535e246650 100644
+--- a/tensilelite/Tensile/SolutionStructs/Naming.py
++++ b/tensilelite/Tensile/SolutionStructs/Naming.py
+@@ -105,7 +105,6 @@ def _getName(state, requiredParameters: frozenset, splitGSU: bool, ignoreInterna
+     if splitGSU:
+       state["GlobalSplitU"] = "M" if (state["GlobalSplitU"] > 1 or state["GlobalSplitU"] == -1) else state["GlobalSplitU"]
+ 
+-
+   requiredParametersTemp = set(requiredParameters.union(["GlobalSplitU"]))
+ 
+   if ignoreInternalArgs:
+diff --git a/tensilelite/Tensile/CustomYamlLoader.py b/tensilelite/Tensile/CustomYamlLoader.py
+index bab8c687509..e03f456fbec 100644
+--- a/tensilelite/Tensile/CustomYamlLoader.py
++++ b/tensilelite/Tensile/CustomYamlLoader.py
+@@ -1,3 +1,6 @@
++# Copyright © Advanced Micro Devices, Inc., or its affiliates.
++# SPDX-License-Identifier: MIT
++
+ import yaml
+ from pathlib import Path
+ 
+Author: Luna Nova <git@lunnova.dev>
+Date:   Sun Oct 12 11:52:10 2025 -0700
+
+    [hipblaslt] intern strings to reduce duplicate memory for solution keys
+
+diff --git a/tensilelite/Tensile/CustomYamlLoader.py b/tensilelite/Tensile/CustomYamlLoader.py
+index 685e69220c..9fdf38d8e5 100644
+--- a/tensilelite/Tensile/CustomYamlLoader.py
++++ b/tensilelite/Tensile/CustomYamlLoader.py
+@@ -1,6 +1,7 @@
+ # Copyright © Advanced Micro Devices, Inc., or its affiliates.
+ # SPDX-License-Identifier: MIT
+ 
++import sys
+ import yaml
+ from pathlib import Path
+ 
+@@ -85,7 +86,7 @@ def parse_scalar(loader: yaml.Loader):
+         if not evt.style:
+             return None
+ 
+-    return value
++    return sys.intern(value)
+ 
+ def load_yaml_stream(yaml_path: Path, loader_type: yaml.Loader):
+     with open(yaml_path, 'r') as f:
+
+diff --git a/tensilelite/Tensile/Common/Parallel.py b/tensilelite/Tensile/Common/Parallel.py
+index 1a2bf9e119..f46100c7b8 100644
+--- a/tensilelite/Tensile/Common/Parallel.py
++++ b/tensilelite/Tensile/Common/Parallel.py
+@@ -22,43 +22,58 @@
+ #
+ ################################################################################
+ 
+-import concurrent.futures
+-import itertools
++import multiprocessing
+ import os
++import re
+ import sys
+ import time
+-
+-from joblib import Parallel, delayed
++from functools import partial
++from typing import Any, Callable
+ 
+ from .Utilities import tqdm
+ 
+ 
+-def joblibParallelSupportsGenerator():
+-    import joblib
+-    from packaging.version import Version
++def get_inherited_job_limit() -> int:
++    # 1. Check CMAKE_BUILD_PARALLEL_LEVEL (CMake 3.12+)
++    if 'CMAKE_BUILD_PARALLEL_LEVEL' in os.environ:
++        try:
++            return int(os.environ['CMAKE_BUILD_PARALLEL_LEVEL'])
++        except ValueError:
++            pass
+ 
+-    joblibVer = joblib.__version__
+-    return Version(joblibVer) >= Version("1.4.0")
++    # 2. Parse MAKEFLAGS for -jN
++    makeflags = os.environ.get('MAKEFLAGS', '')
++    match = re.search(r'-j\s*(\d+)', makeflags)
++    if match:
++        return int(match.group(1))
+ 
++    return -1
+ 
+-def CPUThreadCount(enable=True):
+-    from .GlobalParameters import globalParameters
+ 
++def CPUThreadCount(enable=True):
+     if not enable:
+         return 1
+-    else:
++    from .GlobalParameters import globalParameters
++
++    # Priority order:
++    # 1. Inherited from build system (CMAKE_BUILD_PARALLEL_LEVEL or MAKEFLAGS)
++    # 2. Explicit --jobs flag
++    # 3. Auto-detect
++    inherited_limit = get_inherited_job_limit()
++    cpuThreads = inherited_limit if inherited_limit > 0 else globalParameters["CpuThreads"]
++
++    if cpuThreads < 1:
+         if os.name == "nt":
+-            # Windows supports at most 61 workers because the scheduler uses
+-            # WaitForMultipleObjects directly, which has the limit (the limit
+-            # is actually 64, but some handles are needed for accounting).
+-            cpu_count = min(os.cpu_count(), 61)
++            cpuThreads = os.cpu_count()
+         else:
+-            cpu_count = len(os.sched_getaffinity(0))
+-        cpuThreads = globalParameters["CpuThreads"]
+-        if cpuThreads == -1:
+-            return cpu_count
++            cpuThreads = len(os.sched_getaffinity(0))
+ 
+-        return min(cpu_count, cpuThreads)
++    if os.name == "nt":
++        # Windows supports at most 61 workers because the scheduler uses
++        # WaitForMultipleObjects directly, which has the limit (the limit
++        # is actually 64, but some handles are needed for accounting).
++        cpuThreads = min(cpuThreads, 61)
++    return max(1, cpuThreads)
+ 
+ 
+ def pcallWithGlobalParamsMultiArg(f, args, newGlobalParameters):
+@@ -71,19 +86,22 @@ def pcallWithGlobalParamsSingleArg(f, arg, newGlobalParameters):
+     return f(arg)
+ 
+ 
+-def apply_print_exception(item, *args):
+-    # print(item, args)
++def OverwriteGlobalParameters(newGlobalParameters):
++    from . import GlobalParameters
++
++    GlobalParameters.globalParameters.clear()
++    GlobalParameters.globalParameters.update(newGlobalParameters)
++
++
++def worker_function(args, function, multiArg):
++    """Worker function that executes in the pool process."""
+     try:
+-        if len(args) > 0:
+-            func = item
+-            args = args[0]
+-            return func(*args)
++        if multiArg:
++            return function(*args)
+         else:
+-            func, item = item
+-            return func(item)
++            return function(args)
+     except Exception:
+         import traceback
+-
+         traceback.print_exc()
+         raise
+     finally:
+@@ -98,154 +116,121 @@ def OverwriteGlobalParameters(newGlobalParameters):
+     GlobalParameters.globalParameters.update(newGlobalParameters)
+ 
+ 
+-def ProcessingPool(enable=True, maxTasksPerChild=None):
+-    import multiprocessing
+-    import multiprocessing.dummy
+-
+-    threadCount = CPUThreadCount()
+-
+-    if (not enable) or threadCount <= 1:
+-        return multiprocessing.dummy.Pool(1)
+-
+-    if multiprocessing.get_start_method() == "spawn":
+-        from . import GlobalParameters
+-
+-        return multiprocessing.Pool(
+-            threadCount,
+-            initializer=OverwriteGlobalParameters,
+-            maxtasksperchild=maxTasksPerChild,
+-            initargs=(GlobalParameters.globalParameters,),
+-        )
+-    else:
+-        return multiprocessing.Pool(threadCount, maxtasksperchild=maxTasksPerChild)
++def progress_logger(iterable, total, message, min_log_interval=5.0):
++    """
++    Generator that wraps an iterable and logs progress with time-based throttling.
+ 
++    Only logs progress if at least min_log_interval seconds have passed since last log.
++    Only prints completion message if task took >= min_log_interval seconds.
+ 
+-def ParallelMap(function, objects, message="", enable=True, method=None, maxTasksPerChild=None):
++    Yields (index, item) tuples.
+     """
+-    Generally equivalent to list(map(function, objects)), possibly executing in parallel.
+-
+-      message: A message describing the operation to be performed.
+-      enable: May be set to false to disable parallelism.
+-      method: A function which can fetch the mapping function from a processing pool object.
+-          Leave blank to use .map(), other possiblities:
+-             - `lambda x: x.starmap` - useful if `function` takes multiple parameters.
+-             - `lambda x: x.imap` - lazy evaluation
+-             - `lambda x: x.imap_unordered` - lazy evaluation, does not preserve order of return value.
+-    """
+-    from .GlobalParameters import globalParameters
++    start_time = time.time()
++    last_log_time = start_time
++    log_interval = 1 + (total // 100)
+ 
+-    threadCount = CPUThreadCount(enable)
+-    pool = ProcessingPool(enable, maxTasksPerChild)
+-
+-    if threadCount <= 1 and globalParameters["ShowProgressBar"]:
+-        # Provide a progress bar for single-threaded operation.
+-        # This works for method=None, and for starmap.
+-        mapFunc = map
+-        if method is not None:
+-            # itertools provides starmap which can fill in for pool.starmap.  It provides imap on Python 2.7.
+-            # If this works, we will use it, otherwise we will fallback to the "dummy" pool for single threaded
+-            # operation.
+-            try:
+-                mapFunc = method(itertools)
+-            except NameError:
+-                mapFunc = None
+-
+-        if mapFunc is not None:
+-            return list(mapFunc(function, tqdm(objects, message)))
+-
+-    mapFunc = pool.map
+-    if method:
+-        mapFunc = method(pool)
+-
+-    objects = zip(itertools.repeat(function), objects)
+-    function = apply_print_exception
+-
+-    countMessage = ""
+-    try:
+-        countMessage = " for {} tasks".format(len(objects))
+-    except TypeError:
+-        pass
++    for idx, item in enumerate(iterable):
++        if idx % log_interval == 0:
++            current_time = time.time()
++            if (current_time - last_log_time) >= min_log_interval:
++                print(f"{message}\t{idx+1: 5d}/{total: 5d}")
++                last_log_time = current_time
++        yield idx, item
+ 
+-    if message != "":
+-        message += ": "
++    elapsed = time.time() - start_time
++    final_idx = idx + 1 if 'idx' in locals() else 0
+ 
+-    print("{0}Launching {1} threads{2}...".format(message, threadCount, countMessage))
+-    sys.stdout.flush()
+-    currentTime = time.time()
+-    rv = mapFunc(function, objects)
+-    totalTime = time.time() - currentTime
+-    print("{0}Done. ({1:.1f} secs elapsed)".format(message, totalTime))
+-    sys.stdout.flush()
+-    pool.close()
+-    return rv
++    if elapsed >= min_log_interval or last_log_time > start_time:
++        print(f"{message} done in {elapsed:.1f}s!\t{final_idx: 5d}/{total: 5d}")
+ 
+ 
+-def ParallelMapReturnAsGenerator(function, objects, message="", enable=True, multiArg=True):
+-    from .GlobalParameters import globalParameters
++def imap_with_progress(pool, func, iterable, total, message, chunksize):
++    results = []
++    for _, result in progress_logger(pool.imap(func, iterable, chunksize=chunksize), total, message):
++        results.append(result)
++    return results
+ 
+-    threadCount = CPUThreadCount(enable)
+-    print("{0}Launching {1} threads...".format(message, threadCount))
+ 
+-    if threadCount <= 1 and globalParameters["ShowProgressBar"]:
+-        # Provide a progress bar for single-threaded operation.
+-        callFunc = lambda args: function(*args) if multiArg else lambda args: function(args)
+-        return [callFunc(args) for args in tqdm(objects, message)]
++def _ParallelMap_generator(worker, objects, objLen, message, chunksize, threadCount, globalParameters, maxtasksperchild):
++    # separate fn because yield makes the entire fn a generator even if unreachable
++    ctx = multiprocessing.get_context('forkserver' if os.name != 'nt' else 'spawn')
+ 
+-    with concurrent.futures.ProcessPoolExecutor(max_workers=threadCount) as executor:
+-        resultFutures = (executor.submit(function, *arg if multiArg else arg) for arg in objects)
+-        for result in concurrent.futures.as_completed(resultFutures):
+-            yield result.result()
++    with ctx.Pool(processes=threadCount, maxtasksperchild=maxtasksperchild,
++                  initializer=OverwriteGlobalParameters, initargs=(globalParameters,)) as pool:
++        for _, result in progress_logger(pool.imap_unordered(worker, objects, chunksize=chunksize), objLen, message):
++            yield result
+ 
+ 
+ def ParallelMap2(
+-    function, objects, message="", enable=True, multiArg=True, return_as="list", procs=None
++    function: Callable,
++    objects: Any,
++    message: str = "",
++    enable: bool = True,
++    multiArg: bool = True,
++    minChunkSize: int = 1,
++    maxWorkers: int = -1,
++    maxtasksperchild: int = 1024,
++    return_as: str = "list"
+ ):
++    """Executes a function over a list of objects in parallel or sequentially.
++
++    This function is generally equivalent to ``list(map(function, objects))``. However, it provides
++    additional functionality to run in parallel, depending on the 'enable' flag and available CPU
++    threads.
++
++    Args:
++        function: The function to apply to each item in 'objects'. If 'multiArg' is True, 'function'
++                  should accept multiple arguments.
++        objects: An iterable of objects to be processed by 'function'. If 'multiArg' is True, each
++                 item in 'objects' should be an iterable of arguments for 'function'.
++        message: Optional; a message describing the operation. Default is an empty string.
++        enable: Optional; if False, disables parallel execution and runs sequentially. Default is True.
++        multiArg: Optional; if True, treats each item in 'objects' as multiple arguments for
++                  'function'. Default is True.
++        return_as: Optional; "list" (default) or "generator_unordered" for streaming results
++
++    Returns:
++        A list or generator containing the results of applying **function** to each item in **objects**.
+     """
+-    Generally equivalent to list(map(function, objects)), possibly executing in parallel.
++    from .GlobalParameters import globalParameters
+ 
+-      message: A message describing the operation to be performed.
+-      enable: May be set to false to disable parallelism.
+-      multiArg: True if objects represent multiple arguments
+-                  (differentiates multi args vs single collection arg)
+-    """
+-    if return_as in ("generator", "generator_unordered") and not joblibParallelSupportsGenerator():
+-        return ParallelMapReturnAsGenerator(function, objects, message, enable, multiArg)
++    threadCount = CPUThreadCount(enable)
+ 
+-    from .GlobalParameters import globalParameters
++    if not hasattr(objects, "__len__"):
++        objects = list(objects)
+ 
+-    threadCount = procs if procs else CPUThreadCount(enable)
++    objLen = len(objects)
++    if objLen == 0:
++        return [] if return_as == "list" else iter([])
+ 
+-    threadCount = CPUThreadCount(enable)
++    f = (lambda x: function(*x)) if multiArg else function
++    if objLen == 1:
++        print(f"{message}: (1 task)")
++        result = [f(x) for x in objects]
++        return result if return_as == "list" else iter(result)
+ 
+-    if threadCount <= 1 and globalParameters["ShowProgressBar"]:
+-        # Provide a progress bar for single-threaded operation.
+-        return [function(*args) if multiArg else function(args) for args in tqdm(objects, message)]
++    extra_message = (
++        f": {threadCount} thread(s)" + f", {objLen} tasks"
++        if objLen
++        else ""
++    )
+ 
+-    countMessage = ""
+-    try:
+-        countMessage = " for {} tasks".format(len(objects))
+-    except TypeError:
+-        pass
+-
+-    if message != "":
+-        message += ": "
+-    print("{0}Launching {1} threads{2}...".format(message, threadCount, countMessage))
+-    sys.stdout.flush()
+-    currentTime = time.time()
+-
+-    pcall = pcallWithGlobalParamsMultiArg if multiArg else pcallWithGlobalParamsSingleArg
+-    pargs = zip(objects, itertools.repeat(globalParameters))
+-
+-    if joblibParallelSupportsGenerator():
+-        rv = Parallel(n_jobs=threadCount, timeout=99999, return_as=return_as)(
+-            delayed(pcall)(function, a, params) for a, params in pargs
+-        )
++    print(f"ParallelMap {message}{extra_message}")
++
++    if threadCount <= 1:
++        result = [f(x) for x in objects]
++        return result if return_as == "list" else iter(result)
++
++    if maxWorkers > 0:
++        threadCount = min(maxWorkers, threadCount)
++
++    chunksize = max(minChunkSize, objLen // 2000)
++    worker = partial(worker_function, function=function, multiArg=multiArg)
++    if return_as == "generator_unordered":
++        # yield results as they complete without buffering
++        return _ParallelMap_generator(worker, objects, objLen, message, chunksize, threadCount, globalParameters, maxtasksperchild)
+     else:
+-        rv = Parallel(n_jobs=threadCount, timeout=99999)(
+-            delayed(pcall)(function, a, params) for a, params in pargs
+-        )
+-
+-    totalTime = time.time() - currentTime
+-    print("{0}Done. ({1:.1f} secs elapsed)".format(message, totalTime))
+-    sys.stdout.flush()
+-    return rv
++        ctx = multiprocessing.get_context('forkserver' if os.name != 'nt' else 'spawn')
++        with ctx.Pool(processes=threadCount, maxtasksperchild=maxtasksperchild,
++                      initializer=OverwriteGlobalParameters, initargs=(globalParameters,)) as pool:
++            return list(imap_with_progress(pool, worker, objects, objLen, message, chunksize))
+diff --git a/tensilelite/Tensile/CustomKernels.py b/tensilelite/Tensile/CustomKernels.py
+index ffceb636f5..127b3386a1 100644
+--- a/tensilelite/Tensile/CustomKernels.py
++++ b/tensilelite/Tensile/CustomKernels.py
+@@ -24,7 +24,9 @@
+ 
+ from . import CUSTOM_KERNEL_PATH
+ from Tensile.Common.ValidParameters import checkParametersAreValid, validParameters, newMIValidParameters
++from Tensile.CustomYamlLoader import DEFAULT_YAML_LOADER
+ 
++from functools import lru_cache
+ import yaml
+ 
+ import os
+@@ -58,10 +60,13 @@ def getCustomKernelConfigAndAssembly(name, directory=CUSTOM_KERNEL_PATH):
+ 
+     return (config, assembly)
+ 
++# getCustomKernelConfig will get called repeatedly on the same file
++# 20x logic loading speedup for aquavanjaram_Cijk_Ailk_Bljk_F8NH_HHS_BH_Bias_HAS_SAB_SAV_freesize_custom_GSUs
++@lru_cache
+ def readCustomKernelConfig(name, directory=CUSTOM_KERNEL_PATH):
+     rawConfig, _ = getCustomKernelConfigAndAssembly(name, directory)
+     try:
+-        return yaml.safe_load(rawConfig)["custom.config"]
++        return yaml.load(rawConfig, Loader=DEFAULT_YAML_LOADER)["custom.config"]
+     except yaml.scanner.ScannerError as e:
+         raise RuntimeError("Failed to read configuration for custom kernel: {0}\nDetails:\n{1}".format(name, e))
+ 
+diff --git a/tensilelite/Tensile/TensileCreateLibrary/Run.py b/tensilelite/Tensile/TensileCreateLibrary/Run.py
+index 835ed9c019..024c6c49c1 100644
+--- a/tensilelite/Tensile/TensileCreateLibrary/Run.py
++++ b/tensilelite/Tensile/TensileCreateLibrary/Run.py
+@@ -26,8 +26,10 @@ import rocisa
+ 
+ import functools
+ import glob
++import gc
+ import itertools
+ import os
++import resource
+ import shutil
+ from pathlib import Path
+ from timeit import default_timer as timer
+@@ -78,6 +80,25 @@ from Tensile.Utilities.Decorators.Timing import timing
+ from .ParseArguments import parseArguments
+ 
+ 
++def getMemoryUsage():
++    """Get peak and current memory usage in MB."""
++    rusage = resource.getrusage(resource.RUSAGE_SELF)
++    peak_memory_mb = rusage.ru_maxrss / 1024  # KB to MB on Linux
++
++    # Get current memory from /proc/self/status
++    current_memory_mb = 0
++    try:
++        with open('/proc/self/status') as f:
++            for line in f:
++                if line.startswith('VmRSS:'):
++                    current_memory_mb = int(line.split()[1]) / 1024  # KB to MB
++                    break
++    except:
++        current_memory_mb = peak_memory_mb  # Fallback
++
++    return (peak_memory_mb, current_memory_mb)
++
++
+ class KernelCodeGenResult(NamedTuple):
+     err: int
+     src: str
+@@ -115,6 +136,29 @@ def processKernelSource(kernelWriterAssembly, data, splitGSU, kernel) -> KernelC
+     )
+ 
+ 
++def processAndAssembleKernelTCL(kernelWriterAssembly, rocisa_data, splitGSU, kernel, assemblyTmpPath, assembler):
++    """
++    Pipeline function for TCL mode that:
++    1. Generates kernel source
++    2. Writes .s file to disk
++    3. Assembles to .o file
++    4. Deletes .s file
++    """
++    result = processKernelSource(kernelWriterAssembly, rocisa_data, splitGSU, kernel)
++    return writeAndAssembleKernel(result, assemblyTmpPath, assembler)
++
++
++def writeMasterSolutionLibrary(name_lib_tuple, newLibraryDir, splitGSU, libraryFormat):
++    """
++    Write a master solution library to disk.
++    Module-level function to support multiprocessing.
++    """
++    name, lib = name_lib_tuple
++    filename = os.path.join(newLibraryDir, name)
++    lib.applyNaming(splitGSU)
++    LibraryIO.write(filename, state(lib), libraryFormat)
++
++
+ def removeInvalidSolutionsAndKernels(results, kernels, solutions, errorTolerant, printLevel: bool, splitGSU: bool):
+     removeKernels = []
+     removeKernelNames = []
+@@ -189,6 +233,24 @@ def writeAssembly(asmPath: Union[Path, str], result: KernelCodeGenResult):
+     return path, isa, wfsize, minResult
+ 
+ 
++def writeAndAssembleKernel(result: KernelCodeGenResult, asmPath: Union[Path, str], assembler):
++    """Write assembly file and immediately assemble it to .o file"""
++    if result.err:
++        printExit(f"Failed to build kernel {result.name} because it has error code {result.err}")
++
++    path = Path(asmPath) / f"{result.name}.s"
++    with open(path, "w", encoding="utf-8") as f:
++        f.write(result.src)
++
++    # Assemble .s -> .o
++    assembler(isaToGfx(result.isa), result.wavefrontSize, str(path), str(path.with_suffix(".o")))
++
++    # Delete assembly file immediately to save disk space
++    path.unlink()
++
++    return KernelMinResult(result.err, result.cuoccupancy, result.pgr, result.mathclk)
++
++
+ def writeHelpers(
+     outputPath, kernelHelperObjs, KERNEL_HELPER_FILENAME_CPP, KERNEL_HELPER_FILENAME_H
+ ):
+@@ -268,13 +330,14 @@ def writeSolutionsAndKernels(
+     numAsmKernels = len(asmKernels)
+     numKernels = len(asmKernels)
+     assert numKernels == numAsmKernels, "Only assembly kernels are supported in TensileLite"
+-    asmIter = zip(
+-        itertools.repeat(kernelWriterAssembly),
+-        itertools.repeat(rocisa.rocIsa.getInstance().getData()),
+-        itertools.repeat(splitGSU),
+-        asmKernels
++
++    processKernelFn = functools.partial(
++        processKernelSource,
++        kernelWriterAssembly=kernelWriterAssembly,
++        data=rocisa.rocIsa.getInstance().getData(),
++        splitGSU=splitGSU
+     )
+-    asmResults = ParallelMap2(processKernelSource, asmIter, "Generating assembly kernels", return_as="list")
++    asmResults = ParallelMap2(processKernelFn, asmKernels, "Generating assembly kernels", return_as="list", multiArg=False)
+     removeInvalidSolutionsAndKernels(
+         asmResults, asmKernels, solutions, errorTolerant, getVerbosity(), splitGSU
+     )
+@@ -282,19 +345,21 @@ def writeSolutionsAndKernels(
+         asmResults, asmKernels, solutions, splitGSU
+     )
+ 
+-    def assemble(ret):
+-        p, isa, wavefrontsize, result = ret
+-        asmToolchain.assembler(isaToGfx(isa), wavefrontsize, str(p), str(p.with_suffix(".o")))
+-
+-    unaryWriteAssembly = functools.partial(writeAssembly, assemblyTmpPath)
+-    compose = lambda *F: functools.reduce(lambda f, g: lambda x: f(g(x)), F)
++    # Use functools.partial to bind assemblyTmpPath and assembler
++    writeAndAssembleFn = functools.partial(
++        writeAndAssembleKernel,
++        asmPath=assemblyTmpPath,
++        assembler=asmToolchain.assembler
++    )
+     ret = ParallelMap2(
+-        compose(assemble, unaryWriteAssembly),
++        writeAndAssembleFn,
+         asmResults,
+         "Writing assembly kernels",
+         return_as="list",
+         multiArg=False,
+     )
++    del asmResults
++    gc.collect()
+ 
+     writeHelpers(outputPath, kernelHelperObjs, KERNEL_HELPER_FILENAME_CPP, KERNEL_HELPER_FILENAME_H)
+     srcKernelFile = Path(outputPath) / "Kernels.cpp"
+@@ -369,32 +434,31 @@ def writeSolutionsAndKernelsTCL(
+ 
+     uniqueAsmKernels = [k for k in asmKernels if not k.duplicate]
+ 
+-    def assemble(ret):
+-        p, isa, wavefrontsize, result = ret
+-        asmToolchain.assembler(isaToGfx(isa), wavefrontsize, str(p), str(p.with_suffix(".o")))
+-        return result
+-
+-    unaryProcessKernelSource = functools.partial(
+-        processKernelSource,
++    processKernelFn = functools.partial(
++        processAndAssembleKernelTCL,
+         kernelWriterAssembly,
+         rocisa.rocIsa.getInstance().getData(),
+         splitGSU,
++        assemblyTmpPath=assemblyTmpPath,
++        assembler=asmToolchain.assembler
+     )
+ 
+-    unaryWriteAssembly = functools.partial(writeAssembly, assemblyTmpPath)
+-    compose = lambda *F: functools.reduce(lambda f, g: lambda x: f(g(x)), F)
+-    ret = ParallelMap2(
+-        compose(assemble, unaryWriteAssembly, unaryProcessKernelSource),
++    results = ParallelMap2(
++        processKernelFn,
+         uniqueAsmKernels,
+         "Generating assembly kernels",
+         multiArg=False,
+         return_as="list"
+     )
++    del processKernelFn
++    gc.collect()
++
+     passPostKernelInfoToSolution(
+-        ret, uniqueAsmKernels, solutions, splitGSU
++        results, uniqueAsmKernels, solutions, splitGSU
+     )
+-    # result.src is very large so let garbage collector know to clean up
+-    del ret
++    del results
++    gc.collect()
++
+     buildAssemblyCodeObjectFiles(
+         asmToolchain.linker,
+         asmToolchain.bundler,
+@@ -493,6 +557,15 @@ def generateKernelHelperObjects(solutions: List[Solution], cxxCompiler: str, isa
+     return sorted(khos, key=sortByEnum, reverse=True) # Ensure that we write Enum kernel helpers are first in list
+ 
+ 
++def libraryIter(lib: MasterSolutionLibrary):
++    if len(lib.solutions):
++        for i, s in enumerate(lib.solutions.items()):
++            yield (i, *s)
++    else:
++        for _, lazyLib in lib.lazyLibraries.items():
++            yield from libraryIter(lazyLib)
++
++
+ @timing
+ def generateLogicDataAndSolutions(logicFiles, args, assembler: Assembler, isaInfoMap):
+ 
+@@ -508,26 +581,23 @@ def generateLogicDataAndSolutions(logicFiles, args, assembler: Assembler, isaInf
+     printSolutionRejectionReason = True
+     printIndexAssignmentInfo = False
+ 
+-    fIter = zip(
+-        logicFiles,
+-        itertools.repeat(assembler),
+-        itertools.repeat(splitGSU),
+-        itertools.repeat(printSolutionRejectionReason),
+-        itertools.repeat(printIndexAssignmentInfo),
+-        itertools.repeat(isaInfoMap),
+-        itertools.repeat(args["LazyLibraryLoading"]),
++    parseLogicFn = functools.partial(
++        LibraryIO.parseLibraryLogicFile,
++        assembler=assembler,
++        splitGSU=splitGSU,
++        printSolutionRejectionReason=printSolutionRejectionReason,
++        printIndexAssignmentInfo=printIndexAssignmentInfo,
++        isaInfoMap=isaInfoMap,
++        lazyLibraryLoading=args["LazyLibraryLoading"]
+     )
+ 
+-    def libraryIter(lib: MasterSolutionLibrary):
+-        if len(lib.solutions):
+-            for i, s in enumerate(lib.solutions.items()):
+-                yield (i, *s)
+-        else:
+-            for _, lazyLib in lib.lazyLibraries.items():
+-                yield from libraryIter(lazyLib)
+-
+     for library in ParallelMap2(
+-        LibraryIO.parseLibraryLogicFile, fIter, "Loading Logics...", return_as="generator_unordered"
++        parseLogicFn, logicFiles, "Loading Logics...",
++        return_as="generator_unordered",
++        minChunkSize=24,
++        maxWorkers=32,
++        maxtasksperchild=1,
++        multiArg=False,
+     ):
+         _, architectureName, _, _, _, newLibrary = library
+ 
+@@ -539,6 +609,9 @@ def generateLogicDataAndSolutions(logicFiles, args, assembler: Assembler, isaInf
+         else:
+             masterLibraries[architectureName] = newLibrary
+             masterLibraries[architectureName].version = args["CodeObjectVersion"]
++        del library, newLibrary
++
++    gc.collect()
+ 
+     # Sort masterLibraries to make global soln index values deterministic
+     solnReIndex = 0
+@@ -734,6 +807,9 @@ def run():
+     )
+     stop_wsk = timer()
+     print(f"Time to generate kernels (s): {(stop_wsk-start_wsk):3.2f}")
++    numKernelHelperObjs = len(kernelHelperObjs)
++    del kernelWriterAssembly, kernelHelperObjs
++    gc.collect()
+ 
+     archs = [ # is this really different than the other archs above?
+         isaToGfx(arch)
+@@ -751,13 +827,10 @@ def run():
+             if kName not in solDict:
+                 solDict["%s"%kName] = kernel
+ 
+-    def writeMsl(name, lib):
+-        filename = os.path.join(newLibraryDir, name)
+-        lib.applyNaming(splitGSU)
+-        LibraryIO.write(filename, state(lib), arguments["LibraryFormat"])
+-
+     filename = os.path.join(newLibraryDir, "TensileLiteLibrary_lazy_Mapping")
+     LibraryIO.write(filename, libraryMapping, "msgpack")
++    del libraryMapping
++    gc.collect()
+ 
+     start_msl = timer()
+     for archName, newMasterLibrary in masterLibraries.items():
+@@ -774,12 +847,22 @@ def run():
+                     kName = getKeyNoInternalArgs(s.originalSolution, splitGSU)
+                     s.sizeMapping.CUOccupancy = solDict["%s"%kName]["CUOccupancy"]
+ 
+-            ParallelMap2(writeMsl,
++            writeFn = functools.partial(
++                writeMasterSolutionLibrary,
++                newLibraryDir=newLibraryDir,
++                splitGSU=splitGSU,
++                libraryFormat=arguments["LibraryFormat"]
++            )
++
++            ParallelMap2(writeFn,
+                          newMasterLibrary.lazyLibraries.items(),
+                          "Writing master solution libraries",
++                         multiArg=False,
+                          return_as="list")
+     stop_msl = timer()
+     print(f"Time to write master solution libraries (s): {(stop_msl-start_msl):3.2f}")
++    del masterLibraries, solutions, kernels, solDict
++    gc.collect()
+ 
+     if not arguments["KeepBuildTmp"]:
+         buildTmp = Path(arguments["OutputPath"]).parent / "library" / "build_tmp"
+@@ -796,8 +879,11 @@ def run():
+     print("")
+ 
+     stop = timer()
++    peak_memory_mb, current_memory_mb = getMemoryUsage()
+ 
+     print(f"Total time (s): {(stop-start):3.2f}")
+     print(f"Total kernels processed: {numKernels}")
+     print(f"Kernels processed per second: {(numKernels/(stop-start)):3.2f}")
+-    print(f"KernelHelperObjs: {len(kernelHelperObjs)}")
++    print(f"KernelHelperObjs: {numKernelHelperObjs}")
++    print(f"Peak memory usage (MB): {peak_memory_mb:,.1f}")
++    print(f"Current memory usage (MB): {current_memory_mb:,.1f}")
+diff --git a/tensilelite/Tensile/TensileMergeLibrary.py b/tensilelite/Tensile/TensileMergeLibrary.py
+index e33c617b6f..ba163e9918 100644
+--- a/tensilelite/Tensile/TensileMergeLibrary.py
++++ b/tensilelite/Tensile/TensileMergeLibrary.py
+@@ -303,8 +303,7 @@ def avoidRegressions(originalDir, incrementalDir, outputPath, forceMerge, noEff=
+         logicsFiles[origFile] = origFile
+         logicsFiles[incFile] = incFile
+ 
+-    iters = zip(logicsFiles.keys())
+-    logicsList = ParallelMap2(loadData, iters, "Loading Logics...", return_as="list")
++    logicsList = ParallelMap2(loadData, logicsFiles.keys(), "Loading Logics...", return_as="list", multiArg=False)
+     logicsDict = {}
+     for i, _ in enumerate(logicsList):
+         logicsDict[logicsList[i][0]] = logicsList[i][1]
+diff --git a/tensilelite/Tensile/TensileUpdateLibrary.py b/tensilelite/Tensile/TensileUpdateLibrary.py
+index 5ff265d0ed..c1803a6349 100644
+--- a/tensilelite/Tensile/TensileUpdateLibrary.py
++++ b/tensilelite/Tensile/TensileUpdateLibrary.py
+@@ -26,7 +26,7 @@ from . import LibraryIO
+ from .Tensile import addCommonArguments, argUpdatedGlobalParameters
+ 
+ from .Common import assignGlobalParameters, print1, restoreDefaultGlobalParameters, HR, \
+-                    globalParameters, architectureMap, ensurePath, ParallelMap, __version__
++                    globalParameters, architectureMap, ensurePath, ParallelMap2, __version__
+ 
+ import argparse
+ import copy
+@@ -149,7 +149,7 @@ def TensileUpdateLibrary(userArgs):
+     for logicFile in logicFiles:
+         print("#   %s" % logicFile)
+     fIter = zip(logicFiles, itertools.repeat(args.logic_path), itertools.repeat(outputPath))
+-    libraries = ParallelMap(UpdateLogic, fIter, "Updating logic files", method=lambda x: x.starmap)
++    libraries = ParallelMap2(UpdateLogic, fIter, "Updating logic files", multiArg=True, return_as="list")
+ 
+ 
+ def main():
+diff --git a/tensilelite/Tensile/Toolchain/Assembly.py b/tensilelite/Tensile/Toolchain/Assembly.py
+index a8b91e8d62..265e1d532c 100644
+--- a/tensilelite/Tensile/Toolchain/Assembly.py
++++ b/tensilelite/Tensile/Toolchain/Assembly.py
+@@ -30,7 +30,7 @@ import subprocess
+ from pathlib import Path
+ from typing import List, Union, NamedTuple
+ 
+-from Tensile.Common import print2
++from Tensile.Common import print1, print2
+ from Tensile.Common.Architectures import isaToGfx
+ from ..SolutionStructs import Solution
+ 
+@@ -92,8 +92,26 @@ def buildAssemblyCodeObjectFiles(
+         if coName:
+           coFileMap[asmDir / (coName + extCoRaw)].add(str(asmDir / (kernel["BaseName"] + extObj)))
+ 
++      # Build reference count map for .o files to handle shared object files
++      # (.o files from kernels marked .duplicate in TensileCreateLibrary)
++      objFileRefCount = collections.Counter()
++      for coFileRaw, objFiles in coFileMap.items():
++        for objFile in objFiles:
++          objFileRefCount[objFile] += 1
++
++      sharedObjFiles = {objFile: count for objFile, count in objFileRefCount.items() if count > 1}
++      if sharedObjFiles:
++        print1(f"Found {len(sharedObjFiles)} .o files shared across multiple code objects:")
++
+       for coFileRaw, objFiles in coFileMap.items():
+         linker(objFiles, str(coFileRaw))
++
++        # Delete .o files after linking once usage count reaches 0
++        for objFile in objFiles:
++          objFileRefCount[objFile] -= 1
++          if objFileRefCount[objFile] == 0:
++            Path(objFile).unlink()
++
+         coFile = destDir / coFileRaw.name.replace(extCoRaw, extCo)
+         if compress:
+           bundler.compress(str(coFileRaw), str(coFile), gfx)
+diff --git a/tensilelite/Tensile/Toolchain/Component.py b/tensilelite/Tensile/Toolchain/Component.py
+index 67fa35e2d8..dde83af4c3 100644
+--- a/tensilelite/Tensile/Toolchain/Component.py
++++ b/tensilelite/Tensile/Toolchain/Component.py
+@@ -355,6 +355,7 @@ class Linker(Component):
+         when invoking the linker, LLVM allows the provision of arguments via a "response file"
+         Reference: https://llvm.org/docs/CommandLine.html#response-files
+         """
++        # FIXME: this prevents threading as clang_args.txt is overwritten
+         with open(Path.cwd() / "clang_args.txt", "wt") as file:
+             file.write(" ".join(srcPaths).replace('\\', '\\\\') if os_name == "nt" else " ".join(srcPaths))
+         return [*(self.default_args), "-o", destPath, "@clang_args.txt"]
+diff --git a/tensilelite/requirements.txt b/tensilelite/requirements.txt
+index 60c4c11445..5c8fd66a88 100644
+--- a/tensilelite/requirements.txt
++++ b/tensilelite/requirements.txt
+@@ -2,8 +2,6 @@ dataclasses; python_version == '3.6'
+ packaging
+ pyyaml
+ msgpack
+-joblib>=1.4.0; python_version >= '3.8'
+-joblib>=1.1.1; python_version < '3.8'
+ simplejson
+ ujson
+ orjson
diff --git a/pkgs/development/rocm-modules/6/hipblaslt/default.nix b/pkgs/development/rocm-modules/6/hipblaslt/default.nix
index e8061a50b92f7..c5f69d37906ec 100644
--- a/pkgs/development/rocm-modules/6/hipblaslt/default.nix
+++ b/pkgs/development/rocm-modules/6/hipblaslt/default.nix
@@ -22,6 +22,7 @@
   ncurses,
   ninja,
   libffi,
+  jemalloc,
   zlib,
   zstd,
   rocmUpdateScript,
@@ -54,7 +55,6 @@ let
     ps.setuptools
     ps.packaging
     ps.nanobind
-    ps.joblib
     ps.msgpack
   ]);
   # workaround: build for one working target if no targets are supported
@@ -86,6 +86,8 @@ stdenv.mkDerivation (finalAttrs: {
   env.ROCM_PATH = "${clr}";
   env.TENSILE_ROCM_ASSEMBLER_PATH = lib.getExe' clr "amdclang++";
   env.TENSILE_GEN_ASSEMBLY_TOOLCHAIN = lib.getExe' clr "amdclang++";
+  env.LD_PRELOAD = "${jemalloc}/lib/libjemalloc.so";
+  env.MALLOC_CONF = "background_thread:true,metadata_thp:auto,dirty_decay_ms:10000,muzzy_decay_ms:10000";
   requiredSystemFeatures = [ "big-parallel" ];
 
   __structuredAttrs = true;
@@ -114,6 +116,10 @@ stdenv.mkDerivation (finalAttrs: {
     # excessive comments are written to temporary asm files in build dir
     # TODO: report upstream, find a better solution
     ./reduce-comment-spam.patch
+    # [hipblaslt] Refactor Parallel.py to drop joblib, massively reduce peak disk space usage
+    # https://github.com/ROCm/rocm-libraries/pull/2073
+    ./TensileCreateLibrary-refactor.patch
+    ./Tensile-interning.patch
   ];
 
   postPatch = ''
@@ -155,7 +161,6 @@ stdenv.mkDerivation (finalAttrs: {
     msgpack-cxx
     libxml2
     python3Packages.msgpack
-    python3Packages.joblib
     zlib
     zstd
   ]

From bfd0d13dc984b8f6fcedccd2c26f1a2c7b5e3261 Mon Sep 17 00:00:00 2001
From: Luna Nova <git@lunnova.dev>
Date: Sat, 11 Oct 2025 18:07:26 -0700
Subject: [PATCH 4/4] rocmPackages.hipblaslt: remove comment patch

no longer needed now we unlink .s / .o files as soon as possible
---
 .../rocm-modules/6/hipblaslt/default.nix      |  3 --
 .../6/hipblaslt/reduce-comment-spam.patch     | 47 -------------------
 2 files changed, 50 deletions(-)
 delete mode 100644 pkgs/development/rocm-modules/6/hipblaslt/reduce-comment-spam.patch

diff --git a/pkgs/development/rocm-modules/6/hipblaslt/default.nix b/pkgs/development/rocm-modules/6/hipblaslt/default.nix
index c5f69d37906ec..fe395d874d157 100644
--- a/pkgs/development/rocm-modules/6/hipblaslt/default.nix
+++ b/pkgs/development/rocm-modules/6/hipblaslt/default.nix
@@ -113,9 +113,6 @@ stdenv.mkDerivation (finalAttrs: {
     # Support loading zstd compressed .dat files, required to keep output under
     # hydra size limit
     ./messagepack-compression-support.patch
-    # excessive comments are written to temporary asm files in build dir
-    # TODO: report upstream, find a better solution
-    ./reduce-comment-spam.patch
     # [hipblaslt] Refactor Parallel.py to drop joblib, massively reduce peak disk space usage
     # https://github.com/ROCm/rocm-libraries/pull/2073
     ./TensileCreateLibrary-refactor.patch
diff --git a/pkgs/development/rocm-modules/6/hipblaslt/reduce-comment-spam.patch b/pkgs/development/rocm-modules/6/hipblaslt/reduce-comment-spam.patch
deleted file mode 100644
index 32969c5177810..0000000000000
--- a/pkgs/development/rocm-modules/6/hipblaslt/reduce-comment-spam.patch
+++ /dev/null
@@ -1,47 +0,0 @@
-diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/format.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/format.hpp
-index b7dcb6f59a..b0625ba769 100644
---- a/tensilelite/rocisa/rocisa/include/format.hpp
-+++ b/tensilelite/rocisa/rocisa/include/format.hpp
-@@ -8,11 +8,13 @@ namespace rocisa
-     // Text format functions
-     inline std::string slash(const std::string& comment)
-     {
-+        return "";
-         return "// " + comment + "\n";
-     }
- 
-     inline std::string slash50(const std::string& comment)
-     {
-+        return "";
-         std::ostringstream oss;
-         oss << std::setw(50) << ""
-             << " // " << comment << "\n";
-@@ -21,16 +23,19 @@ namespace rocisa
- 
-     inline std::string block(const std::string& comment)
-     {
-+        return "";
-         return "/* " + comment + " */\n";
-     }
- 
-     inline std::string blockNewLine(const std::string& comment)
-     {
-+        return "";
-         return "\n/* " + comment + " */\n";
-     }
- 
-     inline std::string block3Line(const std::string& comment)
-     {
-+        return "";
-         std::ostringstream oss;
-         oss << "\n/******************************************/\n";
-         std::istringstream iss(comment);
-@@ -52,7 +57,7 @@ namespace rocisa
-         {
-             formattedStr = "\"" + formattedStr + "\\n\\t\"";
-         }
--        if(!comment.empty())
-+        if(false)
-         {
-             std::string buffer = formattedStr
-                                  + std::string(std::max(0, 50 - int(formattedStr.length())), ' ')