diff --git a/.devcontainer/cuda12.9-gcc14/devcontainer.json b/.devcontainer/cuda12.9-gcc14/devcontainer.json
new file mode 100644
index 00000000000..74a4d8f852b
--- /dev/null
+++ b/.devcontainer/cuda12.9-gcc14/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-gcc14-cuda12.9",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9-gcc14"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.9-gcc14",
+    "CCCL_CUDA_VERSION": "12.9",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "14",
+    "CCCL_BUILD_INFIX": "cuda12.9-gcc14",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda12.9-gcc14"
+}
diff --git a/.devcontainer/cuda12.9-nvhpc25.5/devcontainer.json b/.devcontainer/cuda12.9-nvhpc25.5/devcontainer.json
new file mode 100644
index 00000000000..fa2a9aab0f6
--- /dev/null
+++ b/.devcontainer/cuda12.9-nvhpc25.5/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-nvhpc25.5",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9-nvhpc25.5"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.9-nvhpc25.5",
+    "CCCL_CUDA_VERSION": "12.9",
+    "CCCL_HOST_COMPILER": "nvhpc",
+    "CCCL_HOST_COMPILER_VERSION": "25.5",
+    "CCCL_BUILD_INFIX": "cuda12.9-nvhpc25.5",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda12.9-nvhpc25.5"
+}
diff --git a/.devcontainer/cuda12.9ext-gcc13/devcontainer.json b/.devcontainer/cuda12.9ext-gcc14/devcontainer.json
similarity index 91%
rename from .devcontainer/cuda12.9ext-gcc13/devcontainer.json
rename to .devcontainer/cuda12.9ext-gcc14/devcontainer.json
index 81cba343b58..847b035d735 100644
--- a/.devcontainer/cuda12.9ext-gcc13/devcontainer.json
+++ b/.devcontainer/cuda12.9ext-gcc14/devcontainer.json
@@ -1,10 +1,10 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.10-cpp-gcc13-cuda12.9ext",
+  "image": "rapidsai/devcontainers:25.10-cpp-gcc14-cuda12.9ext",
   "runArgs": [
     "--init",
     "--name",
-    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9ext-gcc13"
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9ext-gcc14"
   ],
   "hostRequirements": {
     "gpu": "optional"
@@ -24,11 +24,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.9ext-gcc13",
+    "DEVCONTAINER_NAME": "cuda12.9ext-gcc14",
     "CCCL_CUDA_VERSION": "12.9",
     "CCCL_HOST_COMPILER": "gcc",
-    "CCCL_HOST_COMPILER_VERSION": "13",
-    "CCCL_BUILD_INFIX": "cuda12.9ext-gcc13",
+    "CCCL_HOST_COMPILER_VERSION": "14",
+    "CCCL_BUILD_INFIX": "cuda12.9ext-gcc14",
     "CCCL_CUDA_EXTENDED": "true",
     "HOST_WORKSPACE": "${localWorkspaceFolder}"
   },
@@ -62,5 +62,5 @@
       }
     }
   },
-  "name": "cuda12.9ext-gcc13"
+  "name": "cuda12.9ext-gcc14"
 }
diff --git a/.devcontainer/cuda12.9ext-llvm19/devcontainer.json b/.devcontainer/cuda12.9ext-llvm19/devcontainer.json
new file mode 100644
index 00000000000..b671c55e7ad
--- /dev/null
+++ b/.devcontainer/cuda12.9ext-llvm19/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-llvm19-cuda12.9ext",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9ext-llvm19"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.9ext-llvm19",
+    "CCCL_CUDA_VERSION": "12.9",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "19",
+    "CCCL_BUILD_INFIX": "cuda12.9ext-llvm19",
+    "CCCL_CUDA_EXTENDED": "true",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda12.9ext-llvm19"
+}
diff --git a/.devcontainer/cuda13.0-gcc11/devcontainer.json b/.devcontainer/cuda13.0-gcc11/devcontainer.json
new file mode 100644
index 00000000000..ab1085badd1
--- /dev/null
+++ b/.devcontainer/cuda13.0-gcc11/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-gcc11-cuda13.0",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0-gcc11"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda13.0-gcc11",
+    "CCCL_CUDA_VERSION": "13.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "11",
+    "CCCL_BUILD_INFIX": "cuda13.0-gcc11",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda13.0-gcc11"
+}
diff --git a/.devcontainer/cuda13.0-gcc12/devcontainer.json b/.devcontainer/cuda13.0-gcc12/devcontainer.json
new file mode 100644
index 00000000000..533843e2c09
--- /dev/null
+++ b/.devcontainer/cuda13.0-gcc12/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-gcc12-cuda13.0",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0-gcc12"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda13.0-gcc12",
+    "CCCL_CUDA_VERSION": "13.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "12",
+    "CCCL_BUILD_INFIX": "cuda13.0-gcc12",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda13.0-gcc12"
+}
diff --git a/.devcontainer/cuda13.0-gcc13/devcontainer.json b/.devcontainer/cuda13.0-gcc13/devcontainer.json
new file mode 100644
index 00000000000..fdbe506273f
--- /dev/null
+++ b/.devcontainer/cuda13.0-gcc13/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-gcc13-cuda13.0",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0-gcc13"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda13.0-gcc13",
+    "CCCL_CUDA_VERSION": "13.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "13",
+    "CCCL_BUILD_INFIX": "cuda13.0-gcc13",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda13.0-gcc13"
+}
diff --git a/.devcontainer/cuda13.0-gcc14/devcontainer.json b/.devcontainer/cuda13.0-gcc14/devcontainer.json
new file mode 100644
index 00000000000..c3b3730cd5b
--- /dev/null
+++ b/.devcontainer/cuda13.0-gcc14/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-gcc14-cuda13.0",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0-gcc14"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda13.0-gcc14",
+    "CCCL_CUDA_VERSION": "13.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "14",
+    "CCCL_BUILD_INFIX": "cuda13.0-gcc14",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda13.0-gcc14"
+}
diff --git a/.devcontainer/cuda13.0-llvm15/devcontainer.json b/.devcontainer/cuda13.0-llvm15/devcontainer.json
new file mode 100644
index 00000000000..67f1d8daf57
--- /dev/null
+++ b/.devcontainer/cuda13.0-llvm15/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-llvm15-cuda13.0",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0-llvm15"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda13.0-llvm15",
+    "CCCL_CUDA_VERSION": "13.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "15",
+    "CCCL_BUILD_INFIX": "cuda13.0-llvm15",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda13.0-llvm15"
+}
diff --git a/.devcontainer/cuda13.0-llvm16/devcontainer.json b/.devcontainer/cuda13.0-llvm16/devcontainer.json
new file mode 100644
index 00000000000..8570a7bbfb8
--- /dev/null
+++ b/.devcontainer/cuda13.0-llvm16/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-llvm16-cuda13.0",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0-llvm16"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda13.0-llvm16",
+    "CCCL_CUDA_VERSION": "13.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "16",
+    "CCCL_BUILD_INFIX": "cuda13.0-llvm16",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda13.0-llvm16"
+}
diff --git a/.devcontainer/cuda13.0-llvm17/devcontainer.json b/.devcontainer/cuda13.0-llvm17/devcontainer.json
new file mode 100644
index 00000000000..034ad5d3e91
--- /dev/null
+++ b/.devcontainer/cuda13.0-llvm17/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-llvm17-cuda13.0",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0-llvm17"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda13.0-llvm17",
+    "CCCL_CUDA_VERSION": "13.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "17",
+    "CCCL_BUILD_INFIX": "cuda13.0-llvm17",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda13.0-llvm17"
+}
diff --git a/.devcontainer/cuda13.0-llvm18/devcontainer.json b/.devcontainer/cuda13.0-llvm18/devcontainer.json
new file mode 100644
index 00000000000..c3e8c8779b6
--- /dev/null
+++ b/.devcontainer/cuda13.0-llvm18/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-llvm18-cuda13.0",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0-llvm18"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda13.0-llvm18",
+    "CCCL_CUDA_VERSION": "13.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "18",
+    "CCCL_BUILD_INFIX": "cuda13.0-llvm18",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda13.0-llvm18"
+}
diff --git a/.devcontainer/cuda13.0-llvm19/devcontainer.json b/.devcontainer/cuda13.0-llvm19/devcontainer.json
new file mode 100644
index 00000000000..9741e4091ee
--- /dev/null
+++ b/.devcontainer/cuda13.0-llvm19/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-llvm19-cuda13.0",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0-llvm19"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda13.0-llvm19",
+    "CCCL_CUDA_VERSION": "13.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "19",
+    "CCCL_BUILD_INFIX": "cuda13.0-llvm19",
+    "CCCL_CUDA_EXTENDED": "false",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda13.0-llvm19"
+}
diff --git a/.devcontainer/cuda13.0ext-gcc14/devcontainer.json b/.devcontainer/cuda13.0ext-gcc14/devcontainer.json
new file mode 100644
index 00000000000..96d93260d12
--- /dev/null
+++ b/.devcontainer/cuda13.0ext-gcc14/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-gcc14-cuda13.0ext",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0ext-gcc14"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda13.0ext-gcc14",
+    "CCCL_CUDA_VERSION": "13.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "14",
+    "CCCL_BUILD_INFIX": "cuda13.0ext-gcc14",
+    "CCCL_CUDA_EXTENDED": "true",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda13.0ext-gcc14"
+}
diff --git a/.devcontainer/cuda13.0ext-llvm19/devcontainer.json b/.devcontainer/cuda13.0ext-llvm19/devcontainer.json
new file mode 100644
index 00000000000..9e8cb48f8f4
--- /dev/null
+++ b/.devcontainer/cuda13.0ext-llvm19/devcontainer.json
@@ -0,0 +1,66 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.10-cpp-llvm19-cuda13.0ext",
+  "runArgs": [
+    "--init",
+    "--name",
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0ext-llvm19"
+  ],
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/{build,wheelhouse}; if test -z ${localEnv:WSLENV}; then docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build >/dev/null; docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/wheelhouse --opt o=bind cccl-wheelhouse >/dev/null; else docker volume create cccl-build >/dev/null; docker volume create cccl-wheelhouse >/dev/null; fi;"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; fi"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda13.0ext-llvm19",
+    "CCCL_CUDA_VERSION": "13.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "19",
+    "CCCL_BUILD_INFIX": "cuda13.0ext-llvm19",
+    "CCCL_CUDA_EXTENDED": "true",
+    "HOST_WORKSPACE": "${localWorkspaceFolder}"
+  },
+  "workspaceFolder": "/home/coder/cccl",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cccl,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build",
+    "source=cccl-wheelhouse,target=/home/coder/cccl/wheelhouse"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "seaube.clangformat",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "seaube.clangformat",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--header-insertion=never",
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda13.0ext-llvm19"
+}
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 459c8e21601..c3b3730cd5b 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,10 +1,10 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.10-cpp-gcc13-cuda12.9",
+  "image": "rapidsai/devcontainers:25.10-cpp-gcc14-cuda13.0",
   "runArgs": [
     "--init",
     "--name",
-    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda12.9-gcc13"
+    "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-cuda13.0-gcc14"
   ],
   "hostRequirements": {
     "gpu": "optional"
@@ -24,11 +24,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.9-gcc13",
-    "CCCL_CUDA_VERSION": "12.9",
+    "DEVCONTAINER_NAME": "cuda13.0-gcc14",
+    "CCCL_CUDA_VERSION": "13.0",
     "CCCL_HOST_COMPILER": "gcc",
-    "CCCL_HOST_COMPILER_VERSION": "13",
-    "CCCL_BUILD_INFIX": "cuda12.9-gcc13",
+    "CCCL_HOST_COMPILER_VERSION": "14",
+    "CCCL_BUILD_INFIX": "cuda13.0-gcc14",
     "CCCL_CUDA_EXTENDED": "false",
     "HOST_WORKSPACE": "${localWorkspaceFolder}"
   },
@@ -62,5 +62,5 @@
       }
     }
   },
-  "name": "cuda12.9-gcc13"
+  "name": "cuda13.0-gcc14"
 }
diff --git a/.devcontainer/make_devcontainers.sh b/.devcontainer/make_devcontainers.sh
index 36a197067c8..afc553423a7 100755
--- a/.devcontainer/make_devcontainers.sh
+++ b/.devcontainer/make_devcontainers.sh
@@ -126,6 +126,10 @@ fi
 # Get the devcontainer image version and define image tag root
 readonly DEVCONTAINER_VERSION=$(echo "$matrix_json" | jq -r '.devcontainer_version')
 
+# Internal image compiler versions:
+readonly CUDA99_GCC_VERSION=$( echo "$matrix_json" | jq -r '.cuda99_gcc_version')
+readonly CUDA99_LLVM_VERSION=$(echo "$matrix_json" | jq -r '.cuda99_clang_version')
+
 # Get unique combinations of cuda version, compiler name/version, and Ubuntu version
 readonly combinations=$(echo "$matrix_json" | jq -c '.combinations[]')
 
@@ -145,15 +149,8 @@ readonly DEFAULT_NAME=$(make_name "$DEFAULT_CUDA" "$DEFAULT_CUDA_EXT" "$DEFAULT_
 update_devcontainer ${base_devcontainer_file} "./temp_devcontainer.json" "$DEFAULT_NAME" "$DEFAULT_CUDA" "$DEFAULT_CUDA_EXT" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_EXE" "$DEFAULT_COMPILER_VERSION" "$DEVCONTAINER_VERSION" "false"
 mv "./temp_devcontainer.json" ${base_devcontainer_file}
 
-# Always create an extended version of the default devcontainer:
-readonly EXT_NAME=$(make_name "$DEFAULT_CUDA" true "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_VERSION")
-update_devcontainer ${base_devcontainer_file} "./temp_devcontainer.json" "$EXT_NAME" "$DEFAULT_CUDA" true "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_EXE" "$DEFAULT_COMPILER_VERSION" "$DEVCONTAINER_VERSION" "false"
-mkdir -p "$EXT_NAME"
-mv "./temp_devcontainer.json" "$EXT_NAME/devcontainer.json"
-
-
 # Create an array to keep track of valid subdirectory names
-valid_subdirs=("$EXT_NAME")
+valid_subdirs=()
 
 # The img folder should not be removed:
 valid_subdirs+=("img")
@@ -164,10 +161,27 @@ for rapids_container in *rapids*; do
 done
 
 # Inject ctk version 99.9
-readonly cuda99_9_gcc=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -rsc '.[].cuda |= "99.9" | .[].internal |= true | .[-1]')
-readonly cuda99_8_gcc=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -rsc '.[].cuda |= "99.8" | .[].internal |= true | .[-1]')
-readonly cuda99_9_llvm=$(echo "$NEWEST_LLVM_CUDA_ENTRY" | jq -rsc '.[].cuda |= "99.9" | .[].internal |= true | .[-1]')
-readonly cuda99_8_llvm=$(echo "$NEWEST_LLVM_CUDA_ENTRY" | jq -rsc '.[].cuda |= "99.8" | .[].internal |= true | .[-1]')
+make_compiler_entry() {
+    # Print one compact JSON object describing a CUDA / host compiler combination.
+    # Args: <compiler_name> <compiler_version> <compiler_exe> <cuda_version>
+    #       <cuda_ext: true|false> [internal: true|false, defaults to false]
+    # String fields go through --arg so jq quotes and escapes them properly;
+    # the two flags go through --argjson so they are parsed as JSON values.
+    jq -nc \
+        --arg     cuda             "$4" \
+        --argjson cuda_ext         "$5" \
+        --arg     compiler_name    "$1" \
+        --arg     compiler_exe     "$3" \
+        --arg     compiler_version "$2" \
+        --argjson internal         "${6:-false}" \
+        '{cuda: $cuda, cuda_ext: $cuda_ext, compiler_name: $compiler_name,
+          compiler_exe: $compiler_exe, compiler_version: $compiler_version, internal: $internal}'
+}
+
+readonly cuda99_8_gcc=$( make_compiler_entry "gcc"  "$CUDA99_GCC_VERSION"  "gcc"   "99.8" "false" "true")
+readonly cuda99_9_gcc=$( make_compiler_entry "gcc"  "$CUDA99_GCC_VERSION"  "gcc"   "99.9" "false" "true")
+readonly cuda99_8_llvm=$(make_compiler_entry "llvm" "$CUDA99_LLVM_VERSION" "clang" "99.8" "false" "true")
+readonly cuda99_9_llvm=$(make_compiler_entry "llvm" "$CUDA99_LLVM_VERSION" "clang" "99.9" "false" "true")
 
 readonly all_comb="$combinations $cuda99_9_gcc $cuda99_8_gcc $cuda99_9_llvm $cuda99_8_llvm"
 # For each unique combination
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 2401037a349..f2365bb69a6 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -14,6 +14,7 @@ ci/ @nvidia/cccl-infra-codeowners
 .clang-format @nvidia/cccl-infra-codeowners
 .clangd @nvidia/cccl-infra-codeowners
 c2h/ @nvidia/cccl-infra-codeowners
+nvbench_helper/ @nvidia/cccl-infra-codeowners
 .vscode @nvidia/cccl-infra-codeowners
 
 # cmake
@@ -27,5 +28,6 @@ benchmarks/ @nvidia/cccl-benchmark-codeowners
 **/benchmarks @nvidia/cccl-benchmark-codeowners
 
 # docs
+README.md @nvidia/cccl-codeowners
 docs/ @nvidia/cccl-codeowners
 examples/ @nvidia/cccl-codeowners
diff --git a/.github/actions/workflow-build/action.yml b/.github/actions/workflow-build/action.yml
index 882bb8c8ae0..20755153429 100644
--- a/.github/actions/workflow-build/action.yml
+++ b/.github/actions/workflow-build/action.yml
@@ -88,10 +88,12 @@ runs:
       run: |
         echo "Parsing matrix file into a workflow..."
 
+        echo "::group::Generating GHA workflow from matrix.yaml"
         ${{ env.matrix_parser }} ${{ inputs.matrix_file }}           \
             --workflows ${{ inputs.workflows }}                      \
             ${{ env.allow_override }}                                \
             ${{ env.dirty_projects_flag }} ${{ env.dirty_projects }}
+        echo "::endgroup::"
 
         if [[ -f workflow/override.json ]]; then
           echo "::group::Override matrix"
diff --git a/.github/actions/workflow-build/build-workflow.py b/.github/actions/workflow-build/build-workflow.py
index 4dc358d7426..61a75854426 100755
--- a/.github/actions/workflow-build/build-workflow.py
+++ b/.github/actions/workflow-build/build-workflow.py
@@ -119,10 +119,14 @@ def canonicalize_ctk_version(ctk_string):
     if ctk_string in matrix_yaml["ctk_versions"]:
         return ctk_string
 
-    # Check for aka's:
+    # Check for aliases:
     for ctk_key, ctk_value in matrix_yaml["ctk_versions"].items():
-        if "aka" in ctk_value and ctk_string == ctk_value["aka"]:
-            return ctk_key
+        if "alias" in ctk_value:
+            # Allow a string or list of strings:
+            aliases = ctk_value["alias"]
+            aliases = [aliases] if isinstance(aliases, str) else aliases
+            if ctk_string in aliases:
+                return ctk_key
 
     raise Exception(f"Unknown CTK version '{ctk_string}'")
 
@@ -136,7 +140,19 @@ def get_ctk(ctk_string):
 @memoize_result
 def parse_cxx_string(cxx_string):
     "Returns (id, version) tuple. Version may be None if not present."
-    return re.match(r"^([a-z]+)-?([\d\.]+)?$", cxx_string).groups()
+    # Captures three groups:
+    # 0: The compiler ID (e.g. 'nvhpc' in ['nvhpc', 'nvhpc25.7', 'nvhpc-25.7', 'nvhpc-prev'])
+    # 1: A numeric version suffix; an optional leading hyphen is matched but not
+    #    captured (e.g. '10' in ['gcc10', 'gcc-10'])
+    # 2: A hyphenated string alias (e.g. 'prev' in 'nvhpc-prev')
+    #
+    # Groups 1 and 2 are mutually exclusive; both are None when there is no suffix.
+    match = re.match(r"^([^\d-]+)(?:-?([\d\.]+)|-(.+))?$", cxx_string)
+    if match is None:
+        raise Exception(f"Invalid host compiler string '{cxx_string}'")
+    cxx_id, version, alias = match.groups()
+    # Clean up to (id, version), preferring the alias when one was given:
+    return (cxx_id, version if alias is None else alias)
 
 
 @memoize_result
@@ -163,11 +179,15 @@ def canonicalize_host_compiler_name(cxx_string):
             hc_def["versions"].keys(), key=lambda x: tuple(map(int, x.split(".")))
         )
 
-    # Check for aka's:
+    # Check for aliases:
     if version not in hc_def["versions"]:
         for version_key, version_data in hc_def["versions"].items():
-            if "aka" in version_data and version == version_data["aka"]:
-                version = version_key
+            if "alias" in version_data:
+                # Allow a string or list of strings:
+                aliases = version_data["alias"]
+                aliases = [aliases] if isinstance(aliases, str) else aliases
+                if version in aliases:
+                    version = version_key
 
     if version not in hc_def["versions"]:
         raise Exception(f"Unknown version '{version}' for host compiler '{id}'.")
@@ -284,7 +304,11 @@ def get_job_type_info(job):
         result["gpu"] = False
     if "cuda_ext" not in result:
         result["cuda_ext"] = False
-    if "force_producer_ctk" not in result:
+    if "force_producer_ctk" in result:
+        result["force_producer_ctk"] = canonicalize_ctk_version(
+            result["force_producer_ctk"]
+        )
+    else:
         result["force_producer_ctk"] = None
     if "needs" not in result:
         result["needs"] = None
@@ -396,13 +420,8 @@ def generate_dispatch_group_name(matrix_job):
 
 def generate_dispatch_job_name(matrix_job, job_type):
     job_info = get_job_type_info(job_type)
-    ctk = matrix_job["ctk"]
-    std_str = ("C++" + str(matrix_job["std"]) + " ") if "std" in matrix_job else ""
     cpu_str = matrix_job["cpu"]
     gpu_str = (", " + matrix_job["gpu"].upper()) if job_info["gpu"] else ""
-    py_version = (
-        (", py" + matrix_job["py_version"]) if "py_version" in matrix_job else ""
-    )
     cuda_compile_arch = (
         (" sm{" + str(matrix_job["sm"]) + "}") if "sm" in matrix_job else ""
     )
@@ -410,9 +429,16 @@ def generate_dispatch_job_name(matrix_job, job_type):
         (" " + matrix_job["cmake_options"]) if "cmake_options" in matrix_job else ""
     )
 
+    ctk = matrix_job["ctk"]
     host_compiler = get_host_compiler(matrix_job["cxx"])
+    std_str = (" C++" + str(matrix_job["std"])) if "std" in matrix_job else ""
+    py_str = (
+        (" py" + str(matrix_job["py_version"])) if "py_version" in matrix_job else ""
+    )
 
-    config_tag = f"CTK{ctk} {std_str}{host_compiler['name']}{host_compiler['version']}"
+    config_tag = (
+        f"CTK{ctk} {host_compiler['name']}{host_compiler['version']}{std_str}{py_str}"
+    )
 
     extra_info = (
         f":{cuda_compile_arch}{cmake_options}"
@@ -420,9 +446,7 @@ def generate_dispatch_job_name(matrix_job, job_type):
         else ""
     )
 
-    return (
-        f"[{config_tag}] {job_info['name']}({cpu_str}{gpu_str}{py_version}){extra_info}"
-    )
+    return f"[{config_tag}] {job_info['name']}({cpu_str}{gpu_str}){extra_info}"
 
 
 def generate_dispatch_job_runner(matrix_job, job_type):
@@ -513,23 +537,29 @@ def generate_dispatch_job_origin(matrix_job, job_type):
 
     job_info = get_job_type_info(job_type)
 
+    # Replace the unexploded 'jobs' tag with the current single job type:
+    origin_job["jobs"] = [job_info["id"]]
+
     # The origin tags are used to build the execution summary for the CI PR comment.
     # Use the human readable job label for the execution summary:
-    origin_job["jobs"] = job_info["name"]
+    origin_job["job_name"] = job_info["name"]
+
+    if not job_info["gpu"]:
+        del origin_job["gpu"]
 
     # Replace some of the clunkier tags with a summary-friendly version:
     if "cxx" in origin_job:
         host_compiler = get_host_compiler(matrix_job["cxx"])
         del origin_job["cxx"]
 
-        origin_job["cxx"] = host_compiler["name"] + host_compiler["version"]
+        origin_job["cxx"] = host_compiler["id"] + host_compiler["version"]
         origin_job["cxx_family"] = host_compiler["name"]
 
     if "cudacxx" in origin_job:
         device_compiler = get_device_compiler(matrix_job)
         del origin_job["cudacxx"]
 
-        origin_job["cudacxx"] = device_compiler["name"] + device_compiler["version"]
+        origin_job["cudacxx"] = device_compiler["id"] + device_compiler["version"]
         origin_job["cudacxx_family"] = device_compiler["name"]
 
     origin["matrix_job"] = origin_job
@@ -1243,6 +1273,9 @@ def print_gha_workflow(args):
 def print_devcontainer_info(args):
     devcontainer_version = matrix_yaml["devcontainer_version"]
 
+    cuda99_gcc_version = matrix_yaml["cuda99_gcc_version"]
+    cuda99_clang_version = matrix_yaml["cuda99_clang_version"]
+
     matrix_jobs = []
 
     # Remove the `exclude` and `override` entries:
@@ -1253,14 +1286,19 @@ def print_devcontainer_info(args):
     for workflow_name in workflow_names:
         matrix_jobs.extend(parse_workflow_matrix_jobs(args, workflow_name))
 
+    # Explode jobs to ensure that the cuda_ext tags are correctly handled:
+    exploded_jobs = []
+    for matrix_job in matrix_jobs:
+        exploded_jobs.extend(explode_tags(matrix_job, "jobs"))
+    matrix_jobs = exploded_jobs
+
     # Check if the extended cuda images are needed:
     for matrix_job in matrix_jobs:
         cuda_ext = False
-        for job in matrix_job["jobs"]:
-            job_info = get_job_type_info(job)
-            if job_info["cuda_ext"]:
-                cuda_ext = True
-                break
+        job = matrix_job["jobs"]
+        job_info = get_job_type_info(job)
+        if job_info["cuda_ext"]:
+            cuda_ext = True
         matrix_job["cuda_ext"] = cuda_ext
 
     # Remove all but the following keys from the matrix jobs:
@@ -1285,6 +1323,8 @@ def print_devcontainer_info(args):
 
     devcontainer_json = {
         "devcontainer_version": devcontainer_version,
+        "cuda99_gcc_version": cuda99_gcc_version,
+        "cuda99_clang_version": cuda99_clang_version,
         "combinations": unique_combinations,
     }
 
diff --git a/.github/actions/workflow-results/prepare-execution-summary.py b/.github/actions/workflow-results/prepare-execution-summary.py
index aa932775ea0..f3304b17536 100755
--- a/.github/actions/workflow-results/prepare-execution-summary.py
+++ b/.github/actions/workflow-results/prepare-execution-summary.py
@@ -123,7 +123,10 @@ def build_summary(jobs, job_times=None):
         update_summary_entry(projects[project], job, job_times)
 
         for tag in matrix_job.keys():
-            if tag == "project":
+            # These are excluded from the summary table:
+            # - Project is already the top-level grouping.
+            # - Human-readable 'job_name' is used in place of 'jobs'.
+            if tag in ["project", "jobs"]:
                 continue
 
             if tag not in tags:
diff --git a/.github/actions/workflow-run-job-linux/action.yml b/.github/actions/workflow-run-job-linux/action.yml
index 543b1b00712..f9420efdfee 100644
--- a/.github/actions/workflow-run-job-linux/action.yml
+++ b/.github/actions/workflow-run-job-linux/action.yml
@@ -45,6 +45,7 @@ runs:
         echo "::endgroup::"
 
     - name: Add NVCC problem matcher
+      continue-on-error: true
       shell: bash --noprofile --norc -euo pipefail {0}
       run: |
         echo "::add-matcher::${{github.workspace}}/.github/problem-matchers/problem-matcher.json"
@@ -54,6 +55,28 @@ runs:
         role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
         aws-region: us-east-2
         role-duration-seconds: 43200 # 12 hours
+    - name: Print CI override matrix job def
+      env:
+        GH_TOKEN: ${{ github.token }}
+      continue-on-error: true
+      shell: bash --noprofile --norc -euo pipefail {0}
+      run: |
+        # Get the origin matrix job definition embedded in the workflow artifact:
+        matrix_job=$(ci/util/workflow/get_job_def.sh | jq -c '.origin.matrix_job')
+
+        # Delete the cxx_family, cudacxx_family, and job_name fields
+        matrix_job=$(echo "$matrix_job" | jq 'del(.cxx_family, .cudacxx_family, .job_name)')
+
+        # Convert to a single line of YAML, with unquoted keys:
+        matrix_job=$(
+          echo "- $matrix_job" |
+            yq -o=yaml |
+            sed -E 's/"([[:alnum:]_]+)"([[:space:]]*):/\1\2:/g' |
+            tr '"' "'"
+        )
+
+        echo -e "\e[1;34mOverride matrix entry:\e[0m"
+        echo -e "\e[1;34m    $matrix_job\e[0m"
     - name: Run command # Do not change this step's name, it is checked in parse-job-times.py
       id: run
       shell: bash --noprofile --norc -euo pipefail {0}
diff --git a/.github/workflows/build-matx.yml b/.github/workflows/build-matx.yml
index 2e8bd345521..1485e47c62b 100644
--- a/.github/workflows/build-matx.yml
+++ b/.github/workflows/build-matx.yml
@@ -92,7 +92,7 @@ jobs:
           .devcontainer/launch.sh \
             --docker \
             --cuda 12.9 \
-            --host gcc13 \
+            --host gcc14 \
             --cuda-ext \
             --env "CCCL_TAG=${CCCL_TAG}" \
             --env "CCCL_VERSION=${CCCL_VERSION}" \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3cf3f78478e..0830d733b32 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,9 +25,13 @@ if (NOT CCCL_TOPLEVEL_PROJECT)
   include(cmake/CCCLAddSubdir.cmake)
 endif()
 
-# We require a higher cmake version for dev builds
 if (CCCL_TOPLEVEL_PROJECT)
+  # We require a higher cmake version for dev builds
   cmake_minimum_required(VERSION 3.21)
+
+  # Handle special CCCL values for CMAKE_CUDA_ARCHITECTURES
+  include(cmake/CCCLCheckCudaArchitectures.cmake)
+  cccl_check_cuda_architectures()
 endif()
 
 option(CCCL_ENABLE_LIBCUDACXX "Enable the libcu++ developer build." OFF)
diff --git a/CMakePresets.json b/CMakePresets.json
index b39ab345fc9..537e7ebc88a 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -13,7 +13,7 @@
       "binaryDir": "${sourceDir}/build/$env{CCCL_BUILD_INFIX}/${presetName}",
       "cacheVariables": {
         "CMAKE_BUILD_TYPE": "Release",
-        "CMAKE_CUDA_ARCHITECTURES": "60;70;80",
+        "CMAKE_CUDA_ARCHITECTURES": "all-major-cccl",
         "CCCL_ENABLE_UNSTABLE": true,
         "CCCL_ENABLE_LIBCUDACXX": false,
         "CCCL_ENABLE_CUB": false,
@@ -55,6 +55,7 @@
       "name": "all-dev",
       "inherits": "base",
       "cacheVariables": {
+        "CMAKE_CUDA_ARCHITECTURES": "native",
         "CCCL_ENABLE_LIBCUDACXX": true,
         "CCCL_ENABLE_CUB": true,
         "CCCL_ENABLE_THRUST": true,
@@ -95,6 +96,7 @@
       "displayName": "all-dev debug",
       "inherits": "all-dev",
       "cacheVariables": {
+        "CMAKE_CUDA_ARCHITECTURES": "native",
         "CMAKE_BUILD_TYPE": "Debug",
         "CMAKE_CUDA_FLAGS": "-G",
         "CCCL_ENABLE_BENCHMARKS": false,
diff --git a/c2h/include/c2h/catch2_test_helper.h b/c2h/include/c2h/catch2_test_helper.h
index efd71918c0d..94e65f6b58b 100644
--- a/c2h/include/c2h/catch2_test_helper.h
+++ b/c2h/include/c2h/catch2_test_helper.h
@@ -36,6 +36,14 @@
 #  define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma("diag push")
 #  define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION  _Pragma("diag pop")
 #endif
+// The nv_diagnostic pragmas in Catch2 macros cause cicc to hang indefinitely in CTK 13.0.
+// See NVBugs 5475335.
+#if _CCCL_VERSION_COMPARE(_CCCL_CTK_, _CCCL_CTK, ==, 13, 0)
+#  undef CATCH_INTERNAL_START_WARNINGS_SUPPRESSION
+#  undef CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#  define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION
+#  define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#endif
 // workaround for error
 // * MSVC14.39: #3185-D: no '#pragma diagnostic push' was found to match this 'diagnostic pop'
 // * MSVC14.29: internal error: assertion failed: alloc_copy_of_pending_pragma: copied pragma has source sequence entry
diff --git a/ci/build_common.sh b/ci/build_common.sh
index 0743a740162..a649b0170a9 100755
--- a/ci/build_common.sh
+++ b/ci/build_common.sh
@@ -96,13 +96,29 @@ if [ -z ${CCCL_BUILD_INFIX+x} ]; then
     CCCL_BUILD_INFIX=""
 fi
 
-# Presets will be configured in this directory:
-BUILD_DIR="../build/${CCCL_BUILD_INFIX}"
+mkdir -p ../build
+# Absolute path to cccl/build
+BUILD_ROOT=$(cd "../build" && pwd)
 
-# The most recent build will always be symlinked to cccl/build/latest
+# Absolute path to per-devcontainer build directory
+BUILD_DIR="$BUILD_ROOT/$CCCL_BUILD_INFIX"
+
+# The most recent devcontainer build dir will always be symlinked to cccl/build/latest
 mkdir -p $BUILD_DIR
-rm -f ../build/latest
-ln -sf $BUILD_DIR ../build/latest
+rm -f $BUILD_ROOT/latest
+ln -sf $BUILD_DIR $BUILD_ROOT/latest
+
+# The most recent preset build dir will always be symlinked to:
+# cccl/build/latest/latest
+# cccl/build/preset-latest
+function symlink_latest_preset { # Usage: symlink_latest_preset <preset>
+    local PRESET=$1 # Preset name; also used as the subdirectory name under $BUILD_DIR.
+    mkdir -p "$BUILD_DIR/$PRESET" # Make sure the link target exists.
+    rm -f "$BUILD_ROOT/latest/latest" # Drop any stale link before re-creating it.
+    ln -sf "$BUILD_DIR/$PRESET" "$BUILD_ROOT/latest/latest"
+    rm -f "$BUILD_ROOT/preset-latest" # Same for the second link, directly under $BUILD_ROOT.
+    ln -sf "$BUILD_DIR/$PRESET" "$BUILD_ROOT/preset-latest"
+}
 
 # Now that BUILD_DIR exists, use readlink to canonicalize the path:
 BUILD_DIR=$(readlink -f "${BUILD_DIR}")
@@ -192,6 +208,8 @@ function configure_preset()
     local CMAKE_OPTIONS=$3
     local GROUP_NAME="🛠️  CMake Configure ${BUILD_NAME}"
 
+    symlink_latest_preset "$PRESET"
+
     pushd .. > /dev/null
     if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
       # Retry 5 times with 30 seconds between attempts to try to WAR network issues during CPM fetch on CI runners:
@@ -223,6 +241,8 @@ function build_preset() {
     local red="1;31"
     local GROUP_NAME="🏗️  Build ${BUILD_NAME}"
 
+    symlink_latest_preset "$PRESET"
+
     if $CONFIGURE_ONLY; then
         return 0
     fi
@@ -270,6 +290,8 @@ function test_preset()
     local PRESET=$2
     local GPU_REQUIRED=${3:-true}
 
+    symlink_latest_preset "$PRESET"
+
     if $CONFIGURE_ONLY; then
         return 0
     fi
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 3f003633eda..e3f31ec78c5 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -5,138 +5,242 @@ workflows:
   #
   # Example:
   # override:
-  #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']}
+  #   - {jobs: ['test'], project: 'thrust', std: 17, ctk: '12.X', cxx: ['gcc12', 'clang16']}
   #
   override:
 
   pull_request:
-    # Old CTK/compiler
-    - {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['gcc7', 'gcc9', 'clang14', 'msvc2019']}
-    # Current CTK build-only
-    - {jobs: ['build'], std: 'max', cxx: ['gcc7', 'gcc8', 'gcc9']}
-    - {jobs: ['build'], std: 'all', cxx: ['gcc10', 'gcc11', 'gcc12']}
-    - {jobs: ['build'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17', 'clang18']}
-    - {jobs: ['build'], std: 'max', cxx: ['msvc2019']}
-    - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
+    # Old CTK: Oldest/newest supported host compilers:
+    - {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['gcc7',  'gcc12', 'clang14',          'msvc2019', 'msvc14.39']}
+    - {jobs: ['build'], std: 'minmax', ctk: '12.X', cxx: ['gcc7',  'gcc',   'clang14', 'clang', 'msvc2019', 'msvc'     ]}
+    - {jobs: ['build'], std: 'minmax', ctk: '13.0', cxx: ['gcc11', 'gcc',   'clang15', 'clang', 'msvc2019', 'msvc'     ]}
+    # Old CTK: cudax has a different support matrix:
+    - {jobs: ['build'], project: 'cudax', ctk: '12.0', std: 'minmax', cxx: ['gcc9',  'gcc12', 'clang14',          'msvc14.39']}
+    - {jobs: ['build'], project: 'cudax', ctk: '12.X', std: 'minmax', cxx: ['gcc9',  'gcc',   'clang14', 'clang', 'msvc']}
+    - {jobs: ['build'], project: 'cudax', ctk: '13.0', std: 'minmax', cxx: ['gcc11', 'gcc',   'clang15', 'clang', 'msvc']}
+    # Current CTK build-only:
+    - {jobs: ['build'], std: 'minmax', cxx: ['gcc11', 'clang15', 'msvc2019'] } # Oldest
+    - {jobs: ['build'], std: 'max',    cxx: ['gcc12', 'gcc13'] }
+    - {jobs: ['build'], std: 'max',    cxx: ['clang16', 'clang17'] }
+    - {jobs: ['build'], std: 'all',    cxx: ['gcc', 'clang', 'msvc']} # Latest
+    # Current CTK build-only: cudax has a different support matrix:
+    - {jobs: ['build'], project: 'cudax', std: 'minmax', cxx: ['gcc11', 'clang15']} # Oldest
+    - {jobs: ['build'], project: 'cudax', std: 'max',    cxx: ['gcc12']}
+    - {jobs: ['build'], project: 'cudax', std: 'max',    cxx: ['clang16', 'clang17', 'clang18']}
+    - {jobs: ['build'], project: 'cudax', std: 'all',    cxx: ['gcc', 'clang', 'msvc']} # Newest
     # Current CTK testing:
-    - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'}
-    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
-    # Split up cub tests:
-    - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'rtxa6000'}
-    - {jobs: ['test_lid1', 'test_lid2'],  project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'rtxa6000'}
-    - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 'rtxa6000'}
-    # Modded builds:
-    - {jobs: ['build'], std: 'all', ctk: '12.9', cxx: 'nvhpc'}
-    - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'}
-    - {jobs: ['build'], std: 'max', cxx: ['gcc', 'msvc'],  sm: ['90;90a', '100;120']}
-    - {jobs: ['test_nolid', 'test_lid0'], project: 'cub',                   std: 'max', gpu: 'h100', sm: 'gpu' }
-    - {jobs: ['test_gpu'],                project: 'thrust',                std: 'max', gpu: 'h100', sm: 'gpu' }
-    - {jobs: ['test'],                    project: ['libcudacxx', 'cudax'], std: 'max', gpu: 'h100', sm: 'gpu' }
-    # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly.
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'}
-    # default_projects: clang-cuda
-    - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'}
-    # Disabled; see discussion on #3633. Should be fixed in clang-20.
-#    - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'}
-#    - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90a'}
-    # nvrtc:
+    - {jobs: ['test'], project: 'thrust',     std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'}
+    - {jobs: ['test'], project: 'libcudacxx', std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'cudax',      std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
+    - {jobs: ['test_nolid', 'test_lid0'], project: 'cub', std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'}
+    - {jobs: ['test_lid1',  'test_lid2'], project: 'cub', std: 'max', cxx: ['gcc'],                  gpu: 'rtxa6000'}
+    # H100 coverage:
+    - {jobs: ['test_nolid', 'test_lid0'], project: 'cub',                   std: 'max', gpu: 'h100' }
+    - {jobs: ['test_gpu'],                project: 'thrust',                std: 'max', gpu: 'h100' }
+    - {jobs: ['test'],                    project: ['libcudacxx', 'cudax'], std: 'max', gpu: 'h100' }
+    # Misc:
+    - {jobs: ['build'], cpu: 'arm64', project: ['libcudacxx', 'cub', 'thrust', 'cudax'], std: 'max', cxx: ['gcc', 'clang']}
+    - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'}
     - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all', gpu: 'rtx2080', sm: 'gpu'}
-    # verify-codegen:
     - {jobs: ['verify_codegen'], project: 'libcudacxx'}
-    # cudax has different CTK reqs:
-    - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 20,    cxx: ['msvc14.39', 'gcc10', 'clang14']}
-    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['gcc10', 'gcc11', 'gcc12']}
-    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['clang14', 'clang15', 'clang16', 'clang17', 'clang18']}
-    - {jobs: ['build'], project: 'cudax', ctk: ['12.9'], std: 'all', cxx: ['nvhpc']}
-    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['msvc']}
-    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['gcc', 'msvc'], sm: ['90;90a', '100;120']}
-    - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
-    - {jobs: ['test'],  project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
-    # Python and c/parallel jobs:
-    - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['rtx2080', 'l4', 'h100']}
-    # TODO Just need this line once cccl.parallel tests pass on 12.5 and 12.6:
-    # - {jobs: ['test'], project: 'python', ctk: ['12.5', 'curr'], py_version: ['3.10', '3.13'], gpu: 'l4'}
-    # These two can be removed once the above is working:
-    - {jobs: ['test_py_headers', 'test_py_coop', 'test_py_examples'], ctk: ['12.5', 'curr'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4'}
-    - {jobs: ['test_py_par'],                                         ctk: ['12.8', 'curr'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4'}
-    - {jobs: ['test'], project: 'python', py_version: '3.13', gpu: 'h100'}
-    # packaging:
-    - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080'}
-    - {jobs: ['test'], project: 'packaging', ctk: 'curr', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
+    # c.parallel -- pinned to gcc13 to match python
+    - {jobs: ['test'], project: 'cccl_c_parallel', ctk: '12.X', cxx: 'gcc13', gpu: ['rtx2080']}
+    - {jobs: ['test'], project: 'cccl_c_parallel', ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']}
+    # Python -- pinned to gcc13 for consistency across CTK images
+    - {jobs: ['test_py_headers', 'test_py_coop'], ctk: ['12.5', '13.X'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4', cxx: 'gcc13'}
+    - {jobs: ['test_py_par', 'test_py_examples'], ctk: ['12.8', '13.X'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4', cxx: 'gcc13'}
+    - {jobs: ['test'], project: 'python', py_version: '3.13', gpu: 'h100', cxx: 'gcc13'}
+    # CCCL packaging:
+    - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'packaging', ctk: '13.0', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'packaging', ctk: '13.X', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
     - {jobs: ['install'], project: 'packaging'}
-    # NVHPC stdpar smoke tests
-    - {jobs: ['build'], project: 'stdpar', std: 'all', ctk: '12.9', cxx: 'nvhpc', cpu: ['amd64', 'arm64']}
+    # NVHPC build
+    - {jobs: ['build'], cxx: 'nvhpc', ctk: 'nvhpc', std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
+    # clang-cuda
+    # - Can't add sm90+ until clang20 (#3633)
+    - {jobs: ['build'], cudacxx: 'clang', ctk: '12.X', std: 'all', cxx: 'clang', sm: '75;80'}
+    # - CTK 13.X unsupported: https://gitlab.kitware.com/cmake/cmake/-/merge_requests/11079#note_1692019
+    # - {jobs: ['build'], cudacxx: 'clang', ctk: '13.X', std: 'all', cxx: 'clang', sm: '75;80'}
 
   nightly:
-    # Edge-case jobs
+    # CTK 12.0 full matrix build: default projects
+    - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']}
+    - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['clang14']}
+    - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['msvc2019', 'msvc14.39']}
+    # CTK 12.X full matrix build: default projects
+    - {jobs: ['build'], std: 'all', ctk: '12.X', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], std: 'all', ctk: '12.X', cxx: ['clang14', 'clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], std: 'all', ctk: '12.X', cxx: ['msvc2019', 'msvc2022']}
+    # CTK 13.0 full matrix build: default projects
+    - {jobs: ['build'], std: 'all', ctk: '13.0', cxx: ['gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], std: 'all', ctk: '13.0', cxx: ['clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], std: 'all', ctk: '13.0', cxx: ['msvc2019', 'msvc2022']}
+    # CTK '13.X' full matrix build: default projects
+    - {jobs: ['build'], std: 'all', ctk: '13.X', cxx: ['gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], std: 'all', ctk: '13.X', cxx: ['clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], std: 'all', ctk: '13.X', cxx: ['msvc2019', 'msvc2022']}
+    # CTK 12.0 full matrix build: cudax
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.0', cxx: ['gcc9', 'gcc10', 'gcc11', 'gcc12']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.0', cxx: ['clang14']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.0', cxx: ['msvc14.39']}
+    # CTK 12.X full matrix build: cudax
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.X', cxx: ['gcc9', 'gcc10', 'gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.X', cxx: ['clang14', 'clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.X', cxx: ['msvc2022']}
+    # CTK 13.0 full matrix build: cudax
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.0', cxx: ['gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.0', cxx: ['clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.0', cxx: ['msvc2022']}
+    # CTK '13.X' full matrix build: cudax
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.X', cxx: ['gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.X', cxx: ['clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.X', cxx: ['msvc2022']}
+    # CTK 12.X testing:
+    - {jobs: ['test'], project: 'libcudacxx', ctk: '12.X', std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'cub',        ctk: '12.X', std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'}
+    - {jobs: ['test'], project: 'thrust',     ctk: '12.X', std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'}
+    - {jobs: ['test'], project: 'cudax',      ctk: '12.X', std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: ['libcudacxx', 'cub', 'thrust', 'cudax'], ctk: '12.X', std: 'max', gpu: 'h100' }
+    # CTK '13.X' testing:
+    - {jobs: ['test'], project: 'libcudacxx', ctk: '13.X', std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'cub',        ctk: '13.X', std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'}
+    - {jobs: ['test'], project: 'thrust',     ctk: '13.X', std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'}
+    - {jobs: ['test'], project: 'cudax',      ctk: '13.X', std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: ['libcudacxx', 'cub', 'thrust', 'cudax'], ctk: '13.X', std: 'max', gpu: 'h100' }
+    # Misc:
+    - {jobs: ['build'], cpu: 'arm64', project: ['libcudacxx', 'cub', 'thrust', 'cudax'], ctk: ['12.X', '13.X'], std: 'all', cxx: ['gcc', 'clang']}
+    # Coming in a later PR after some logistical issues with `-arch all` and RAM usage are resolved:
+    # - {jobs: ['build'], sm: 'all-cccl', project: ['cub', 'thrust', 'libcudacxx', 'cudax'], ctk: ['12.X', '13.X'], std: 'all', cxx: ['gcc', 'msvc'] }
+    - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'}
+    - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 'rtx4090'}
     - {jobs: ['limited'], project: 'cub', std: 17, gpu: 'rtx2080'}
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'}
-    - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 'rtx4090'}
-    # Old CTK/compiler
-    - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc7', 'gcc8', 'gcc9', 'clang14', 'msvc2019']}
-    - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc11'], sm: '60;70;80;90'}
-    # Current CTK build-only
-    - {jobs: ['build'], std: 'all', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']}
-    - {jobs: ['build'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17', 'clang18']}
-    - {jobs: ['build'], std: 'all', cxx: ['msvc2019']}
-    # Test current CTK
-    - {jobs: ['test'],      project: 'cub',        std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'}
-    - {jobs: ['test'],      project: 'thrust',     std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'}
-    - {jobs: ['test'],      project: 'libcudacxx', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
-    # Modded builds:
-    - {jobs: ['build'], std: 'all', ctk: '12.9', cxx: 'nvhpc'}
-    - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
-
-    # Extended GPU tests:
-    - {jobs: ['test_nolid', 'test_lid0'], project: 'cub',                   std: 'max', sm: '70;75;89;90', gpu: ['v100', 't4', 'l4', 'h100']}
-    - {jobs: ['test_gpu'],                project: 'thrust',                std: 'max', sm: '70;75;89;90', gpu: ['v100', 't4', 'l4', 'h100']}
-    - {jobs: ['test'],                    project: ['libcudacxx', 'cudax'], std: 'max', sm: '70;75;89;90', gpu: ['v100', 't4', 'l4', 'h100']}
-    # MSVC build coverage of extended GPU nightly tests:
-    - {jobs: ['build'], project: ['cub', 'thrust', 'libcudacxx', 'cudax'],  std: 'max', sm: '70;75;89;90',                cxx: 'msvc'}
-    # Build-only coverage of extended arches. `sm` split up to parallelize.
-    - {jobs: ['build'], project: ['cub', 'thrust', 'libcudacxx', 'cudax'],  std: 'max', sm: ['86;90a', '100;103', '120'], cxx: ['msvc', 'gcc'] }
-
-    # default_projects: clang-cuda
-    - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'}
-    # Disabled; see discussion on #3633. Should be fixed in clang-20.
-    # - {jobs: ['build'], project: 'libcudacxx', std: 'all', cudacxx: 'clang', cxx: 'clang', sm: '90'}
-    # - {jobs: ['build'], project: 'libcudacxx', std: 'all', cudacxx: 'clang', cxx: 'clang', sm: '90a'}
-    # cudax
-    - {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc9', 'gcc10', 'gcc11', 'gcc12']}
-    - {jobs: ['build'], project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17', 'clang18']}
-    - {jobs: ['build'], project: 'cudax', ctk: [        '12.9'], std: 'all', cxx: ['nvhpc']}
-    - {jobs: ['build'], project: 'cudax', ctk: ['12.0'        ], std: '20',  cxx: ['msvc14.39']}
-    - {jobs: ['build'], project: 'cudax', ctk: [        'curr'], std: '20',  cxx: ['msvc']}
-    - {jobs: ['build'], project: 'cudax', ctk: [        'curr'], std: '20',  cxx: ['gcc', 'msvc'], sm: ['90;90a', '100;120']}
-    - {jobs: ['build'], project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
-    - {jobs: ['test'],  project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']  , gpu: 'rtx2080'}
-    - {jobs: ['test'],  project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['gcc']  ,   gpu: 'rtx2080'}
-    - {jobs: ['test'],  project: 'cudax', ctk: ['12.0'        ], std: 'all', cxx: ['clang14'], gpu: 'rtx2080'}
-    - {jobs: ['test'],  project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['clang'],   gpu: 'rtx2080'}
-    # Python and c/parallel jobs:
-    - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['rtx2080', 'l4', 'h100']}
-    # TODO Just need this line once cccl.parallel tests pass on 12.5 and 12.6:
-    # - {jobs: ['test'], project: 'python', ctk: ['12.5', '12.6', '12.8', '12.9'], py_version: ['3.10', '3.11', '3.12', '3.13'], gpu: 'l4'}
-    # These two can be removed once the above is working.
-    - {jobs: ['test_py_headers', 'test_py_coop', 'test_py_examples'], ctk: ['12.5', '12.6', '12.8', '12.9'], py_version: ['3.10', '3.11', '3.12', '3.13'], project: 'python', gpu: 'l4'}
-    - {jobs: ['test_py_par'],                                         ctk: [                '12.8', '12.9'], py_version: ['3.10', '3.11', '3.12', '3.13'], project: 'python', gpu: 'l4'}
-    - {jobs: ['test'], project: 'python', py_version: '3.13', gpu: 'h100'}
-    # packaging:
-    - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080'}
-    - {jobs: ['test'], project: 'packaging', ctk: 'curr', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
+    # NVRTC tests don't currently support 12.0:
+    - {jobs: ['nvrtc'],          project: 'libcudacxx', ctk: [        '12.X', '13.0', '13.X'], cxx: 'gcc12', std: 'all', gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['verify_codegen'], project: 'libcudacxx', ctk: ['12.0', '12.X', '13.0', '13.X'], cxx: 'gcc12'}
+    # c.parallel -- pinned to gcc13 to match python
+    - {jobs: ['test'],  project: ['cccl_c_parallel'], ctk: '12.X', cxx: 'gcc13', gpu: ['rtx2080']}
+    - {jobs: ['test'],  project: ['cccl_c_parallel'], ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']}
+    # Python -- pinned to gcc13 for consistency across CTK images
+    - {jobs: ['test_py_headers', 'test_py_coop'], project: 'python', ctk: ['12.5', '12.6', '12.8', '12.9', '13.0'], py_version: ['3.10', '3.13'], gpu: 'l4', cxx: 'gcc13'}
+    - {jobs: ['test_py_par', 'test_py_examples'], project: 'python', ctk: [                '12.8', '12.9', '13.0'], py_version: ['3.10', '3.13'], gpu: 'l4', cxx: 'gcc13'}
+    - {jobs: ['test'], project: 'python', ctk: ['12.X', '13.X'], py_version: ['3.10', '3.11', '3.12', '3.13'], gpu: 'l4', cxx: 'gcc13'}
+    - {jobs: ['test'], project: 'python', ctk: ['12.X', '13.X'], py_version: '3.13', gpu: 'h100', cxx: 'gcc13'}
+    # CCCL packaging:
+    - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'packaging', ctk: '13.0', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'packaging', ctk: '13.X', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
     - {jobs: ['install'], project: 'packaging'}
-    # NVHPC stdpar smoke tests
-    - {jobs: ['build'], project: 'stdpar', std: 'all', ctk: '12.9', cxx: 'nvhpc', cpu: ['amd64', 'arm64']}
+    # NVHPC build
+    - {jobs: ['build'], cxx: 'nvhpc-prev', ctk: 'nvhpc-prev', std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
+    - {jobs: ['build'], cxx: 'nvhpc',      ctk: 'nvhpc',      std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
+    # clang-cuda
+    # - Can't add sm90+ until clang20 (#3633)
+    - {jobs: ['build'], cudacxx: 'clang', ctk: '12.X', std: 'all', cxx: 'clang', sm: '75;80'}
+    # - CTK 13.X unsupported: https://gitlab.kitware.com/cmake/cmake/-/merge_requests/11079#note_1692019
+    # - {jobs: ['build'], cudacxx: 'clang', ctk: '13.X', std: 'all', cxx: 'clang', sm: '75;80'}
 
   weekly:
+    # CTK 12.0 full matrix build: default projects
+    - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']}
+    - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['clang14']}
+    - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['msvc2019', 'msvc14.39']}
+    # CTK 12.X full matrix build: default projects
+    - {jobs: ['build'], std: 'all', ctk: '12.X', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], std: 'all', ctk: '12.X', cxx: ['clang14', 'clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], std: 'all', ctk: '12.X', cxx: ['msvc2019', 'msvc2022']}
+    # CTK 13.0 full matrix build: default projects
+    - {jobs: ['build'], std: 'all', ctk: '13.0', cxx: ['gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], std: 'all', ctk: '13.0', cxx: ['clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], std: 'all', ctk: '13.0', cxx: ['msvc2019', 'msvc2022']}
+    # CTK '13.X' full matrix build: default projects
+    - {jobs: ['build'], std: 'all', ctk: '13.X', cxx: ['gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], std: 'all', ctk: '13.X', cxx: ['clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], std: 'all', ctk: '13.X', cxx: ['msvc2019', 'msvc2022']}
+    # CTK 12.0 full matrix build: cudax
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.0', cxx: ['gcc9', 'gcc10', 'gcc11', 'gcc12']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.0', cxx: ['clang14']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.0', cxx: ['msvc14.39']}
+    # CTK 12.X full matrix build: cudax
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.X', cxx: ['gcc9', 'gcc10', 'gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.X', cxx: ['clang14', 'clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '12.X', cxx: ['msvc2022']}
+    # CTK 13.0 full matrix build: cudax
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.0', cxx: ['gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.0', cxx: ['clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.0', cxx: ['msvc2022']}
+    # CTK '13.X' full matrix build: cudax
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.X', cxx: ['gcc11', 'gcc12', 'gcc13', 'gcc14']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.X', cxx: ['clang15', 'clang16', 'clang17', 'clang18', 'clang19']}
+    - {jobs: ['build'], project: 'cudax', std: 'all', ctk: '13.X', cxx: ['msvc2022']}
+    # CTK 12.X testing:
+    - {jobs: ['test'], project: 'libcudacxx', ctk: '12.X', std: 'minmax', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'cub',        ctk: '12.X', std: 'minmax', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'}
+    - {jobs: ['test'], project: 'thrust',     ctk: '12.X', std: 'minmax', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'}
+    - {jobs: ['test'], project: 'cudax',      ctk: '12.X', std: 'minmax', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: ['libcudacxx', 'cub', 'thrust', 'cudax'], ctk: '12.X', std: 'minmax', gpu: 'h100' }
+    # CTK '13.X' testing:
+    - {jobs: ['test'], project: 'libcudacxx', ctk: '13.X', std: 'minmax', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'cub',        ctk: '13.X', std: 'minmax', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'}
+    - {jobs: ['test'], project: 'thrust',     ctk: '13.X', std: 'minmax', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'}
+    - {jobs: ['test'], project: 'cudax',      ctk: '13.X', std: 'minmax', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: ['libcudacxx', 'cub', 'thrust', 'cudax'], ctk: '13.X', std: 'minmax', gpu: 'h100' }
+    # Misc:
+    - {jobs: ['build'], cpu: 'arm64', project: ['libcudacxx', 'cub', 'thrust', 'cudax'], ctk: ['12.X', '13.X'], std: 'all', cxx: ['gcc', 'clang']}
+    # Coming in a later PR after some logistical issues with `-arch all` and RAM usage are resolved:
+    # - {jobs: ['build'], sm: 'all-cccl', project: ['cub', 'thrust', 'libcudacxx', 'cudax'], ctk: ['12.X', '13.X'], std: 'all', cxx: ['gcc', 'msvc'] }
+    - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'}
+    - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 'rtx4090'}
+    - {jobs: ['limited'], project: 'cub', std: 17, gpu: 'rtx2080'}
+    # NVRTC tests don't currently support 12.0:
+    - {jobs: ['nvrtc'],          project: 'libcudacxx', ctk: [        '12.X', '13.0', '13.X'], cxx: 'gcc12', std: 'all', gpu: 'rtx2080', sm: 'gpu'}
+    - {jobs: ['verify_codegen'], project: 'libcudacxx', ctk: ['12.0', '12.X', '13.0', '13.X'], cxx: 'gcc12'}
+    # c.parallel -- pinned to gcc13 to match python
+    - {jobs: ['test'],  project: ['cccl_c_parallel'], ctk: '12.X', cxx: 'gcc13', gpu: ['rtx2080']}
+    - {jobs: ['test'],  project: ['cccl_c_parallel'], ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']}
+    # Python -- pinned to gcc13 for consistency across CTK images
+    - {jobs: ['test_py_headers', 'test_py_coop'], project: 'python', ctk: ['12.5', '12.6', '12.8', '12.9', '13.0'], py_version: ['3.10', '3.11', '3.12', '3.13'], gpu: 'l4', cxx: 'gcc13'}
+    - {jobs: ['test_py_par', 'test_py_examples'], project: 'python', ctk: [                '12.8', '12.9', '13.0'], py_version: ['3.10', '3.11', '3.12', '3.13'], gpu: 'l4', cxx: 'gcc13'}
+    - {jobs: ['test'], project: 'python', ctk: ['12.X', '13.X'], py_version: '3.13', gpu: 'h100', cxx: 'gcc13'}
+    # CCCL packaging:
+    - {jobs: ['test'], project: 'packaging', ctk: '12.0', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'packaging', ctk: '12.X', cxx: ['gcc10', 'clang14'], gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'packaging', ctk: '13.0', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
+    - {jobs: ['test'], project: 'packaging', ctk: '13.X', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
+    - {jobs: ['install'], project: 'packaging'}
+    # NVHPC build
+    - {jobs: ['build'], cxx: 'nvhpc-prev', ctk: 'nvhpc-prev', std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
+    - {jobs: ['build'], cxx: 'nvhpc',      ctk: 'nvhpc',      std: 'all', project: ['libcudacxx', 'cub', 'thrust', 'cudax', 'stdpar'], cpu: ['amd64', 'arm64']}
+    # clang-cuda
+    # - Can't add sm90+ until clang20 (#3633)
+    - {jobs: ['build'], cudacxx: 'clang', ctk: '12.X', std: 'all', cxx: 'clang', sm: '75;80'}
+    # - CTK 13.X unsupported: https://gitlab.kitware.com/cmake/cmake/-/merge_requests/11079#note_1692019
+    # - {jobs: ['build'], cudacxx: 'clang', ctk: '13.X', std: 'all', cxx: 'clang', sm: '75;80'}
+    # compute-sanitizer
     - {jobs: ['compute_sanitizer'], project: 'cub', std: 'max', gpu: 'rtxa6000', sm: 'gpu', cmake_options: '-DCMAKE_CUDA_FLAGS=-lineinfo'}
 
   python-wheels:
-    # TODO Just need this line once cccl.parallel tests pass on 12.5 and 12.6:
-    # - {jobs: ['test'], project: 'python', ctk: ['12.5', '12.6', '12.8', '12.9'], py_version: ['3.10', '3.11', '3.12', '3.13'], gpu: 'l4'}
-    # These two can be removed once the above is working.
-    - {jobs: ['test_py_headers', 'test_py_coop', 'test_py_examples'], ctk: ['12.5', '12.6', '12.8', '12.9'], py_version: ['3.10', '3.11', '3.12', '3.13'], project: 'python', gpu: 'l4'}
-    - {jobs: ['test_py_par'],                                         ctk: ['12.8', '12.9'],                 py_version: ['3.10', '3.11', '3.12', '3.13'], project: 'python', gpu: 'l4'}
+    - {jobs: ['test_py_headers', 'test_py_coop'], project: 'python', ctk: ['12.5', '12.6', '12.8', '12.9', '13.0'], py_version: ['3.10', '3.11', '3.12', '3.13'], gpu: 'l4', cxx: 'gcc13'}
+    - {jobs: ['test_py_par', 'test_py_examples'], project: 'python', ctk: [                '12.8', '12.9', '13.0'], py_version: ['3.10', '3.11', '3.12', '3.13'], gpu: 'l4', cxx: 'gcc13'}
+    - {jobs: ['test'], project: 'python', ctk: ['12.X', '13.X'], py_version: '3.13', gpu: 'h100', cxx: 'gcc13'}
+
+  # This is just used to ensure that we generate devcontainers for all images we build.
+  # These do not map to any actual jobs.
+  devcontainers:
+    - {jobs: ['dc'],     ctk: ['12.0', '12.9'        ], cxx: ['clang14']}
+    - {jobs: ['dc'],     ctk: ['12.0', '12.9'        ], cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10']}
+    - {jobs: ['dc'],     ctk: ['12.0', '12.9', '13.0'], cxx: ['gcc11', 'gcc12']}
+    - {jobs: ['dc'],     ctk: [        '12.9', '13.0'], cxx: ['gcc13']}
+    - {jobs: ['dc'],     ctk: [        '12.9', '13.0'], cxx: ['clang15', 'clang16', 'clang17', 'clang18']}
+    - {jobs: ['dc'],     ctk: [        '12.9', '13.0'], cxx: ['gcc14', 'clang19']}
+    - {jobs: ['dc_ext'], ctk: [        '12.9', '13.0'], cxx: ['gcc14', 'clang19']}
+    # Python images, pinned at gcc13 for consistency, as 12.5 / 12.6 don't support newer gcc.
+    - {jobs: ['dc'], ctk: ['12.5', '12.6', '12.8', '12.9', '13.0'], cxx: 'gcc13'}
+    # NVHPC
+    - {jobs: ['dc'], cxx: 'nvhpc-prev', ctk: 'nvhpc-prev'}
+    - {jobs: ['dc'], cxx: 'nvhpc',      ctk: 'nvhpc'}
 
   # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows.
   exclude:
@@ -150,16 +254,27 @@ workflows:
 # The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers
 devcontainer_version: '25.10'
 
+# Compiler versions used for the cuda99.X internal builds:
+cuda99_gcc_version: 13
+cuda99_clang_version: 19
+
 # All supported C++ standards:
 all_stds: [17, 20]
 
+# Aliases:
+# - 12.X: Newest CTK 12.X version.
+# - 13.X: Newest CTK 13.X version.
+# - nvhpc: CTK shipped in newest NVHPC
+# - nvhpc-prev: CTK shipped in previous NVHPC
+# - pybuild: Selects image to use for python wheel builds' outer docker instance
 ctk_versions:
   12.0: { stds: [17, 20] }
   12.5: { stds: [17, 20] }
   12.6: { stds: [17, 20] }
   # 12.7 not buildable by current devcontainer scripting.
   12.8: { stds: [17, 20] }
-  12.9: { stds: [17, 20], aka: 'curr' }
+  12.9: { stds: [17, 20], alias: ['12.X', 'nvhpc', 'nvhpc-prev', 'pybuild'] }
+  13.0: { stds: [17, 20], alias: ['13.X'] }
 
 device_compilers:
   nvcc: # Version / stds are taken from CTK
@@ -182,6 +297,7 @@ host_compilers:
       11: { stds: [17, 20] }
       12: { stds: [17, 20] }
       13: { stds: [17, 20] }
+      14: { stds: [17, 20] }
   clang:
     name: 'Clang'
     container_tag: 'llvm'
@@ -198,24 +314,27 @@ host_compilers:
     container_tag: 'cl'
     exe: cl
     versions:
-      14.29: { stds: [ 17,   ], aka: '2019' }
-      14.39: { stds: [ 17, 20]} # CTK 12.0 doesn't recognize >14.39 as MSVC 2022.
-      14.43: { stds: [ 17, 20], aka: '2022' }
+      14.29: { stds: [17,   ], alias: '2019' }
+      14.39: { stds: [17, 20] } # CTK 12.0 doesn't recognize >14.39 as MSVC 2022.
+      14.43: { stds: [17, 20], alias: '2022' }
   nvhpc:
     name: 'NVHPC'
     container_tag: 'nvhpc'
     exe: nvc++
     versions:
-      25.7: { stds: [17, 20 ] }
+      # !! Update the ctk_versions 'nvhpc*' aliases when updating NVHPC versions:
+      25.5: { stds: [17, 20], alias: 'prev' }
+      25.7: { stds: [17, 20] }
 
 # Jobs support the following properties:
 #
-# - gpu: Whether the job requires a GPU runner. Default is false.
 # - name: The human-readable name of the job. Default is the capitalized job key.
 # - needs:
 #   - A list of jobs that must be completed before this job can run. Default is an empty list.
-#   - These are automatically added if needed:
+#   - These jobs are automatically added if needed:
 #     - Eg. "jobs: ['test']" in the workflow def will also create the required 'build' jobs.
+# - gpu: Whether the job requires a GPU runner. Default is false.
+# - cuda_ext: Whether the job requires a devcontainer with extra CUDA libraries. Default is false.
 # - invoke:
 #   - Map the job type to the script invocation spec:
 #     - prefix: The script invocation prefix. Default is the job name.
@@ -230,6 +349,10 @@ host_compilers:
 #   - E.g. "force_producer_ctk: '12.0'" on a test step will force the generated build step to use CTK 12.0.
 
 jobs:
+  # Only used for generating devcontainers. No scripts actually exist for these:
+  dc:     { gpu: false }
+  dc_ext: { gpu: false, cuda_ext: true }
+
   # General:
   build:        { gpu: false }
   test:         { gpu: true, needs: 'build' }
@@ -272,10 +395,10 @@ jobs:
 
   # Python:
   build_py_wheel:   { name: "Build cuda.cccl",             gpu: false, invoke: { prefix: 'build_cuda_cccl'} }
-  test_py_headers:  { name: "Test cuda.cccl.headers",      gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "12.9", invoke: { prefix: 'test_cuda_cccl_headers'} }
-  test_py_coop:     { name: "Test cuda.cccl.cooperative",  gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "12.9", invoke: { prefix: 'test_cuda_cccl_cooperative'} }
-  test_py_par:      { name: "Test cuda.cccl.parallel",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "12.9", invoke: { prefix: 'test_cuda_cccl_parallel'} }
-  test_py_examples: { name: "Test cuda.cccl.examples",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "12.9", invoke: { prefix: 'test_cuda_cccl_examples'} }
+  test_py_headers:  { name: "Test cuda.cccl.headers",      gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_headers'} }
+  test_py_coop:     { name: "Test cuda.cccl.cooperative",  gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_cooperative'} }
+  test_py_par:      { name: "Test cuda.cccl.parallel",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_parallel'} }
+  test_py_examples: { name: "Test cuda.cccl.examples",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} }
 
 # Projects have the following properties:
 #
@@ -350,7 +477,7 @@ tags:
   jobs: { required: true }
   # CUDA ToolKit version
   # See the `ctks` map.
-  ctk: { default: 'curr' }
+  ctk: { default: '13.X' }
   # CPU architecture
   cpu: { default: 'amd64' }
   # GPU model
diff --git a/ci/matx/build_matx.sh b/ci/matx/build_matx.sh
index f15f98bca3f..9057a89d132 100755
--- a/ci/matx/build_matx.sh
+++ b/ci/matx/build_matx.sh
@@ -95,7 +95,7 @@ rm -rf build
 mkdir build
 cd build
 cmake -G Ninja ../MatX \
-  "-DCMAKE_CUDA_ARCHITECTURES=60;70;80" \
+  "-DCMAKE_CUDA_ARCHITECTURES=75;80" \
   "-DRAPIDS_CMAKE_CPM_OVERRIDE_VERSION_FILE=${version_override_file}" \
   -DMATX_BUILD_TESTS=ON \
   -DMATX_BUILD_EXAMPLES=ON \
diff --git a/ci/util/artifacts/download/fetch.sh b/ci/util/artifacts/download/fetch.sh
index 9e1ce9740d8..2f1c682eaaa 100755
--- a/ci/util/artifacts/download/fetch.sh
+++ b/ci/util/artifacts/download/fetch.sh
@@ -29,6 +29,7 @@ mkdir -p "$2"
 readonly target_directory="$(cd "$2" && pwd)"
 
 echo "Downloading artifact '$artifact_name' to '$target_directory'"
-gh run download ${GITHUB_RUN_ID} \
-  --name "$artifact_name" \
-  --dir "$target_directory"
+"$ci_dir/util/retry.sh" 5 30 \
+  gh run download ${GITHUB_RUN_ID} \
+    --name "$artifact_name" \
+    --dir "$target_directory"
diff --git a/cmake/CCCLCheckCudaArchitectures.cmake b/cmake/CCCLCheckCudaArchitectures.cmake
new file mode 100644
index 00000000000..8ceea67f9cb
--- /dev/null
+++ b/cmake/CCCLCheckCudaArchitectures.cmake
@@ -0,0 +1,106 @@
+# This file provides utilities to handle special CMAKE_CUDA_ARCHITECTURES lists for CCCL.
+#
+# If CMAKE_CUDA_ARCHITECTURES is set to one of the following values, it will be replaced
+# as described:
+#
+# 'all-cccl': All architectures known to the current NVCC at or above minimum_cccl_arch.
+#
+# 'all-major-cccl': All major architectures known to the current NVCC at or above
+# minimum_cccl_arch, plus 'minimum_cccl_arch' itself.
+#
+# For example on 12.9:
+#   all: 50-real;52-real;53-real;60-real;61-real;62-real;70-real;72-real;75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;103-real;120-real;121-real;121-virtual
+#   all-cccl: 75-real;80-real;86-real;87-real;89-real;90-real;100-real;101-real;103-real;120-real;121-real;121-virtual
+#   all-major: 50-real;60-real;70-real;80-real;90-real;100-real;120-real;120-virtual
+#   all-major-cccl: 75-real;80-real;90-real;100-real;120-real;120-virtual
+
+# We don't support arches below what the latest CTK release supports:
+set(minimum_cccl_arch 75) # 13.x dropped below Turing
+
+# Expand the special '*-cccl' values of CMAKE_CUDA_ARCHITECTURES into the explicit lists described above.
+function(cccl_check_cuda_architectures)
+  if (NOT CMAKE_CUDA_ARCHITECTURES MATCHES "-cccl$")
+    return() # Not a special CCCL request; leave the value untouched.
+  endif()
+  message(STATUS "Detected special CCCL arch request: CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}")
+
+  # Start from everything nvcc reports, then drop what CCCL does not support.
+  _cccl_detect_nvcc_arch_support(arches)
+  _cccl_filter_to_supported_arches(arches)
+
+  if (CMAKE_CUDA_ARCHITECTURES STREQUAL "all-major-cccl")
+    _cccl_filter_to_all_major_cccl(arches)
+  elseif (NOT CMAKE_CUDA_ARCHITECTURES STREQUAL "all-cccl") # 'all-cccl' needs no further filtering.
+    message(FATAL_ERROR "Invalid CMAKE_CUDA_ARCHITECTURES value: ${CMAKE_CUDA_ARCHITECTURES}")
+  endif()
+
+  _cccl_add_real_virtual_arch_tags(arches)
+  message(STATUS "Replacing with CMAKE_CUDA_ARCHITECTURES=${arches}")
+  set(CMAKE_CUDA_ARCHITECTURES "${arches}" CACHE STRING "CUDA architectures for CCCL" FORCE)
+endfunction()
+
+# Query `nvcc --help` and store the sorted, de-duplicated arch numbers it lists in `arches_var`.
+function(_cccl_detect_nvcc_arch_support arches_var)
+  find_package(CUDAToolkit)
+  if (NOT CUDAToolkit_FOUND)
+    message(FATAL_ERROR "CUDAToolkit not found, '${CMAKE_CUDA_ARCHITECTURES}' arch detection failed.")
+  endif()
+  # A failing nvcc invocation aborts configuration (COMMAND_ERROR_IS_FATAL ANY).
+  execute_process(
+    COMMAND "${CUDAToolkit_NVCC_EXECUTABLE}" --help
+    OUTPUT_VARIABLE nvcc_help_output
+    COMMAND_ERROR_IS_FATAL ANY
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
+  # Collect every 'compute_NN' token from the help text and keep only the numeric part.
+  string(REGEX MATCHALL "compute_[0-9]+" supported_arches "${nvcc_help_output}")
+  string(REPLACE "compute_" "" supported_arches "${supported_arches}")
+  list(SORT supported_arches COMPARE NATURAL)
+  list(REMOVE_DUPLICATES supported_arches)
+  message(VERBOSE "NVCC supports: ${supported_arches}")
+  set(${arches_var} ${supported_arches} PARENT_SCOPE)
+endfunction()
+
+# Drop every arch below minimum_cccl_arch from the list named by `arches_var`.
+function(_cccl_filter_to_supported_arches arches_var)
+  set(kept_arches "${${arches_var}}")
+  foreach(candidate IN LISTS ${arches_var})
+    if (NOT candidate GREATER_EQUAL minimum_cccl_arch)
+      list(REMOVE_ITEM kept_arches "${candidate}")
+    endif()
+  endforeach()
+  message(VERBOSE "CCCL supported arches: ${kept_arches}")
+  set(${arches_var} ${kept_arches} PARENT_SCOPE)
+endfunction()
+
+# Collapse an all-cccl list to all-major-cccl: one arch per decade, never below minimum_cccl_arch.
+function(_cccl_filter_to_all_major_cccl arches_var)
+  set(rounded_arches "")
+  foreach(candidate IN LISTS ${arches_var})
+    math(EXPR rounded "(${candidate} / 10) * 10")
+    if (rounded LESS minimum_cccl_arch)
+      set(rounded "${minimum_cccl_arch}")
+    endif()
+    list(APPEND rounded_arches ${rounded})
+  endforeach()
+  # Duplicates are dropped afterwards; the first occurrence keeps its position.
+  list(REMOVE_DUPLICATES rounded_arches)
+  message(VERBOSE "CCCL all-major arches: ${rounded_arches}")
+  set(${arches_var} ${rounded_arches} PARENT_SCOPE)
+endfunction()
+
+# Tag every arch in `arches_var` as '-real' and add a '-virtual' entry for the last one.
+function(_cccl_add_real_virtual_arch_tags arches_var)
+  if ("${${arches_var}}" STREQUAL "") # An empty list would otherwise yield '-real;-virtual'.
+    message(FATAL_ERROR "No CUDA architectures available for '${CMAKE_CUDA_ARCHITECTURES}'.")
+  endif()
+
+  set(tagged_arches "")
+  foreach(arch IN LISTS ${arches_var})
+    list(APPEND tagged_arches "${arch}-real")
+  endforeach()
+  list(GET ${arches_var} -1 last_arch)
+  list(APPEND tagged_arches "${last_arch}-virtual")
+  message(VERBOSE "CCCL tagged arches: ${tagged_arches}")
+  set(${arches_var} ${tagged_arches} PARENT_SCOPE)
+endfunction()
diff --git a/cub/test/ptx-json/CMakeLists.txt b/cub/test/ptx-json/CMakeLists.txt
index f0fcea76697..cd7aaf0595b 100644
--- a/cub/test/ptx-json/CMakeLists.txt
+++ b/cub/test/ptx-json/CMakeLists.txt
@@ -64,7 +64,9 @@ function(cub_detail_ptx_json_add_test target_name_var source)
     )
     set_target_properties(${target_name} PROPERTIES
       CUDA_PTX_COMPILATION ON
-      CUDA_ARCHITECTURES 90
+      # Use compute_80 -- CTK 13.0 started running ptxas on 90+, even when just producing PTX.
+      # This breaks the ptx-json stuff, which produces intentionally invalid PTX.
+      CUDA_ARCHITECTURES "80-virtual"
     )
 
     add_test(NAME ${target_name}
diff --git a/cudax/include/cuda/experimental/__graph/graph_node_ref.cuh b/cudax/include/cuda/experimental/__graph/graph_node_ref.cuh
index 3fcfe3cd609..f854829f6f1 100644
--- a/cudax/include/cuda/experimental/__graph/graph_node_ref.cuh
+++ b/cudax/include/cuda/experimental/__graph/graph_node_ref.cuh
@@ -251,7 +251,7 @@ struct graph_node_ref
         __graph_,
         __deps.data(), // dependencies
         __src_arr.get(), // dependant nodes
-        __nullptr, // no edge data
+        nullptr, // no edge data
         __deps.size()); // number of dependencies
 #else
       _CCCL_TRY_CUDA_API(
diff --git a/libcudacxx/test/libcudacxx/cuda/annotated_ptr/annotated_ptr_constexpr.pass.cpp b/libcudacxx/test/libcudacxx/cuda/annotated_ptr/annotated_ptr_constexpr.pass.cpp
index dbb891a2c79..31e001c87f0 100644
--- a/libcudacxx/test/libcudacxx/cuda/annotated_ptr/annotated_ptr_constexpr.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/annotated_ptr/annotated_ptr_constexpr.pass.cpp
@@ -10,7 +10,7 @@
 // UNSUPPORTED: nvrtc
 
 // error: expression must have a constant value annotated_ptr.h: note #2701-D: attempt to access run-time storage
-// UNSUPPORTED: clang-14, gcc-11, gcc-10, gcc-9, gcc-8, gcc-7, msvc-19.29
+// UNSUPPORTED: clang-14, gcc-12, gcc-11, gcc-10, gcc-9, gcc-8, gcc-7, msvc-19.29
 // UNSUPPORTED: msvc && nvcc-12.0
 
 #include <cuda/annotated_ptr>
diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/default_accessor/accessor.submdspan.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/default_accessor/accessor.submdspan.pass.cpp
index b095bebecfa..ad94155c7b8 100644
--- a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/default_accessor/accessor.submdspan.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/default_accessor/accessor.submdspan.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // nvbug5272086
-// UNSUPPORTED: nvcc-12.9 && msvc
+// UNSUPPORTED: msvc
 
 #include <cuda/mdspan>
 #include <cuda/std/type_traits>
diff --git a/libcudacxx/test/libcudacxx/cuda/iterators/transform_input_output_iterator/ctor.value.pass.cpp b/libcudacxx/test/libcudacxx/cuda/iterators/transform_input_output_iterator/ctor.value.pass.cpp
index 6cad936e0c9..440cd396dc1 100644
--- a/libcudacxx/test/libcudacxx/cuda/iterators/transform_input_output_iterator/ctor.value.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/iterators/transform_input_output_iterator/ctor.value.pass.cpp
@@ -34,10 +34,10 @@ __host__ __device__ constexpr bool test()
     buffer[2] = 2;
 
     // The test iterators are not `is_nothrow_move_constructible`
-#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC2019)
+#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
     static_assert(
       !noexcept(cuda::transform_input_output_iterator{random_access_iterator{buffer + 2}, input_func, output_func}));
-#endif // !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC2019)
+#endif // !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
     static_assert(
       cuda::std::is_same_v<decltype(iter),
                            cuda::transform_input_output_iterator<random_access_iterator<int*>, InputFn, OutputFn>>);
@@ -64,11 +64,11 @@ __host__ __device__ constexpr bool test()
     assert(buffer[2] == output_func(3));
     buffer[2] = 2;
 
-#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC2019)
+#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
     // The test iterators are not `is_nothrow_move_constructible`
     static_assert(!noexcept(cuda::transform_input_output_iterator<random_access_iterator<int*>, InputFn, OutputFn>{
       random_access_iterator{buffer + 2}, input_func, output_func}));
-#endif // !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC2019)
+#endif // !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
   }
 
   {
diff --git a/libcudacxx/test/libcudacxx/cuda/iterators/transform_output_iterator/ctor.value.pass.cpp b/libcudacxx/test/libcudacxx/cuda/iterators/transform_output_iterator/ctor.value.pass.cpp
index 4c6c680d445..0eaab87cfb1 100644
--- a/libcudacxx/test/libcudacxx/cuda/iterators/transform_output_iterator/ctor.value.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/iterators/transform_output_iterator/ctor.value.pass.cpp
@@ -30,10 +30,10 @@ __host__ __device__ constexpr bool test()
     *iter = 3;
     assert(buffer[2] == 3 + 1);
     buffer[2] = 2;
-#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC2019)
+#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
     // The test iterators are not `is_nothrow_move_constructible`
     static_assert(!noexcept(cuda::transform_output_iterator{random_access_iterator{buffer + 2}, func}));
-#endif // !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC2019)
+#endif // !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
     static_assert(
       cuda::std::is_same_v<decltype(iter), cuda::transform_output_iterator<random_access_iterator<int*>, Fn>>);
   }
@@ -54,11 +54,11 @@ __host__ __device__ constexpr bool test()
     *iter = 3;
     assert(buffer[2] == 3 + 1);
     buffer[2] = 2;
-#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC2019)
+#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
     // The test iterators are not `is_nothrow_move_constructible`
     static_assert(!noexcept(
       cuda::transform_output_iterator<random_access_iterator<int*>, Fn>{random_access_iterator{buffer + 2}, func}));
-#endif // !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC2019)
+#endif // !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
   }
 
   {
diff --git a/libcudacxx/test/libcudacxx/libcxx/macros/architecture.compile.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/macros/architecture.compile.pass.cpp
index 345fce365f1..4ef1da4884d 100644
--- a/libcudacxx/test/libcudacxx/libcxx/macros/architecture.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/libcxx/macros/architecture.compile.pass.cpp
@@ -10,19 +10,21 @@
 #include <cuda/std/__cccl/architecture.h>
 #include <cuda/std/__cccl/compiler.h>
 
-#if !defined(__CUDACC_RTC__)
+#if !_CCCL_COMPILER(NVRTC)
 #  if _CCCL_ARCH(X86_64)
 #    if _CCCL_COMPILER(MSVC)
 #      include <intrin.h>
 #    elif _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG)
 #      include <cpuid.h>
-#    endif
-#  endif
+#    endif // _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG)
+#  endif // _CCCL_ARCH(X86_64)
 
-#  if _CCCL_ARCH(ARM64) && defined(__ARM_ACLE)
-#    include <arm_acle.h>
-#  endif
-#endif
+#  if !_CCCL_COMPILER(NVHPC) // nvbug5395777
+#    if _CCCL_ARCH(ARM64) && defined(__ARM_ACLE)
+#      include <arm_acle.h>
+#    endif // _CCCL_ARCH(ARM64) && defined(__ARM_ACLE)
+#  endif // !_CCCL_COMPILER(NVHPC)
+#endif // !_CCCL_COMPILER(NVRTC)
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan/deduction.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan/deduction.pass.cpp
index 9c9dcf8c568..d87c43c4bb2 100644
--- a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan/deduction.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan/deduction.pass.cpp
@@ -48,7 +48,7 @@
 //                typename MappingType::layout_type, AccessorType>;
 
 // nvbug5272086
-// UNSUPPORTED: nvcc-12.9 && msvc
+// UNSUPPORTED: msvc
 
 #include <cuda/std/cassert>
 #include <cuda/std/concepts>
diff --git a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/submdspan/layout_left.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/submdspan/layout_left.pass.cpp
index 4b506b70ecf..8b77ef07838 100644
--- a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/submdspan/layout_left.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/submdspan/layout_left.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // nvbug5272086
-// UNSUPPORTED: nvcc-12.9 && msvc
+// UNSUPPORTED: msvc
 
 // <mdspan>
 
diff --git a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/submdspan/layout_right.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/submdspan/layout_right.pass.cpp
index d1bf237a8ce..1e9c23c5096 100644
--- a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/submdspan/layout_right.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/submdspan/layout_right.pass.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 // nvbug5272086
-// UNSUPPORTED: nvcc-12.9 && msvc
+// UNSUPPORTED: msvc
 
 // <mdspan>
 
diff --git a/libcudacxx/test/libcudacxx/std/ranges/range.adaptors/range.all/range.all/all.pass.cpp b/libcudacxx/test/libcudacxx/std/ranges/range.adaptors/range.all/range.all/all.pass.cpp
index 541ab57bb42..4662f91fb7f 100644
--- a/libcudacxx/test/libcudacxx/std/ranges/range.adaptors/range.all/range.all/all.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/ranges/range.adaptors/range.all/range.all/all.pass.cpp
@@ -161,9 +161,9 @@ __host__ __device__ constexpr bool test()
     static_assert(cuda::std::is_same_v<decltype(cuda::std::views::all(View<true>())), View<true>>);
     static_assert(noexcept(cuda::std::views::all(View<true>())));
 // old GCC seems to fall over the noexcept clauses here
-#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC2019)
+#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
     static_assert(!noexcept(cuda::std::views::all(View<false>())));
-#endif // no broken noexcept
+#endif // !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
 
     auto viewCopy = cuda::std::views::all(View<true>(2));
     static_assert(cuda::std::is_same_v<decltype(viewCopy), View<true>>);
@@ -175,9 +175,9 @@ __host__ __device__ constexpr bool test()
     static_assert(cuda::std::is_same_v<decltype(cuda::std::views::all(cuda::std::declval<const CopyableView<true>&>())),
                                        CopyableView<true>>);
     static_assert(noexcept(cuda::std::views::all(CopyableView<true>())));
-#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC2019)
+#if !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
     static_assert(!noexcept(cuda::std::views::all(CopyableView<false>())));
-#endif // no broken noexcept
+#endif // !TEST_COMPILER(GCC, <, 9) && !TEST_COMPILER(MSVC)
 
     CopyableView<true> view(2);
     auto viewCopy = cuda::std::views::all(view);
diff --git a/libcudacxx/test/utils/libcudacxx/test/config.py b/libcudacxx/test/utils/libcudacxx/test/config.py
index 489fc033347..ee475e78790 100644
--- a/libcudacxx/test/utils/libcudacxx/test/config.py
+++ b/libcudacxx/test/utils/libcudacxx/test/config.py
@@ -12,6 +12,7 @@
 import re
 import shlex
 import shutil
+import subprocess
 import sys
 
 import libcudacxx.util
@@ -194,6 +195,70 @@ def get_compute_capabilities(self):
         )
         return deduced_comput_archs_str
 
+    def _get_nvcc_archs(self):
+        """Return the sorted compute capabilities (ints) listed by ``nvcc --help``.
+
+        Calls ``lit_config.fatal`` for non-nvcc compilers or when nothing is found.
+        """
+        if self.cxx.type != "nvcc":
+            self.lit_config.fatal(
+                "Retrieving compute capabilities is only supported for nvcc compiler type"
+            )
+            return []
+
+        # Run nvcc directly and parse in Python: a `grep | sed | sort` shell pipeline
+        # reports only sort's exit status (hiding nvcc failures from check=True),
+        # breaks on compiler paths containing spaces, and is unavailable on Windows.
+        result = subprocess.run(
+            [self.cxx.path, "--help"],
+            check=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            universal_newlines=True,
+        )
+        archs = {int(arch) for arch in re.findall(r"compute_([0-9]+)", result.stdout)}
+        if not archs:
+            self.lit_config.fatal(
+                "Failed to retrieve compute capabilities or no capabilities found."
+            )
+            return []
+        return sorted(archs)
+
+    def get_all_major_compute_capabilities(self):
+        """Build the arch list that mirrors nvcc's ``--arch=all-major``.
+
+        Returns a ';'-joined string of '-real' entries plus one '-virtual' entry
+        for the last arch, or "" when no archs are reported.
+        """
+        detected = self._get_nvcc_archs()
+        if not detected:
+            return ""
+
+        # One entry per decade; the first slot keeps the oldest arch verbatim
+        # since it need not be a round decade (e.g. first arch is 75, not 70).
+        majors = sorted({arch // 10 * 10 for arch in detected})
+        majors[0] = detected[0]
+
+        tagged = [f"{arch}-real" for arch in majors] + [f"{majors[-1]}-virtual"]
+        result = ";".join(tagged)
+
+        self.lit_config.note("Deduced major compute capabilities are: %s" % result)
+        return result
+
+    def get_all_compute_capabilities(self):
+        """Return every nvcc arch as '-real', plus '-virtual' for the last one.
+
+        The result is a ';'-joined string, or "" when no archs are reported.
+        """
+        detected = self._get_nvcc_archs()
+        if not detected:
+            return ""
+
+        tagged = [f"{arch}-real" for arch in detected] + [f"{detected[-1]}-virtual"]
+        result = ";".join(tagged)
+        self.lit_config.note("Deduced compute capabilities are: %s" % result)
+        return result
+
     def get_modules_enabled(self):
         return self.get_lit_bool(
             "enable_modules", default=False, env_var="LIBCUDACXX_ENABLE_MODULES"
@@ -752,8 +817,12 @@ def configure_compile_flags(self):
             self.lit_config.note("Compute Archs: %s" % compute_archs)
             if compute_archs == "native":
                 compute_archs = self.get_compute_capabilities()
+            elif compute_archs == "all":
+                compute_archs = self.get_all_compute_capabilities()
+            elif compute_archs == "all-major":
+                compute_archs = self.get_all_major_compute_capabilities()
 
-            compute_archs = set(sorted(re.split("\\s|;|,", compute_archs)))
+            compute_archs = sorted(set(re.split("\\s|;|,", compute_archs)))
             for s in compute_archs:
                 # Split arch and mode i.e. 80-virtual -> 80, virtual
                 arch, *mode = re.split("-", s)