4 changes: 2 additions & 2 deletions .github/workflows/build.yaml
@@ -126,7 +126,7 @@ jobs:
context: .
file: ${{ matrix.dockerfile }}
push: ${{ github.event_name != 'pull_request' }}
platforms: "linux/amd64"
platforms: ${{ matrix.platforms || 'linux/amd64' }}
build-args: |
SCCACHE_GHA_ENABLED=${{ matrix.sccache }}
CUDA_COMPUTE_CAP=${{ matrix.cudaComputeCap }}
@@ -168,7 +168,7 @@ jobs:
target: grpc
file: ${{ matrix.dockerfile }}
push: ${{ github.event_name != 'pull_request' }}
platforms: "linux/amd64"
platforms: ${{ matrix.platforms || 'linux/amd64' }}
build-args: |
SCCACHE_GHA_ENABLED=${{ matrix.sccache }}
CUDA_COMPUTE_CAP=${{ matrix.cudaComputeCap }}
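The `${{ matrix.platforms || 'linux/amd64' }}` expression is a GitHub Actions fallback: matrix entries that declare a `platforms` field (the new `blackwell-121` and `cpu-arm64` entries below) get multi-arch builds, while every other entry keeps the previous amd64-only behavior. A rough local equivalent of what the build step runs for the `blackwell-121` entry (a sketch; the builder and tag names are illustrative, and CI pushes the result rather than keeping it in the local build cache):

```shell
# One-time: create a buildx builder that can target multiple platforms.
docker buildx create --name tei-builder --use

# Multi-arch build matching the blackwell-121 matrix entry.
docker buildx build . \
  -f Dockerfile-cuda \
  --platform linux/amd64,linux/arm64 \
  --build-arg CUDA_COMPUTE_CAP=121 \
  -t tei:121-local
```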
19 changes: 19 additions & 0 deletions .github/workflows/matrix.json
@@ -63,6 +63,16 @@
"grpc": true,
"dockerfile": "Dockerfile-cuda"
},
{
"name": "blackwell-121",
"imageNamePrefix": "121-",
"runOn": "always",
"sccache": true,
"cudaComputeCap": 121,
"platforms": "linux/amd64,linux/arm64",
"grpc": true,
"dockerfile": "Dockerfile-cuda"
},
{
"name": "all",
"imageNamePrefix": "cuda-",
@@ -79,6 +89,15 @@
"grpc": true,
"dockerfile": "Dockerfile"
},
{
"name": "cpu-arm64",
"imageNamePrefix": "cpu-arm64-",
"runOn": "always",
"sccache": true,
"platforms": "linux/arm64",
"grpc": true,
"dockerfile": "Dockerfile-arm64"
},
{
"name": "cpu-ipex",
"imageNamePrefix": "cpu-ipex-",
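Each new entry opts into multi-arch builds through its `platforms` field, which the workflow expression above falls back from. To inspect the new entries locally (a sketch assuming `jq` is available; the recursive filter avoids assuming how the JSON file is nested):

```shell
jq '.. | objects | select(.name? == "blackwell-121" or .name? == "cpu-arm64")' \
  .github/workflows/matrix.json
```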
21 changes: 18 additions & 3 deletions Dockerfile-cuda
@@ -12,8 +12,14 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
pkg-config \
&& rm -rf /var/lib/apt/lists/*

# Donwload and configure sccache
RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
# Download and configure sccache (multi-arch)
ARG TARGETARCH
RUN case "${TARGETARCH}" in \
"amd64") SCCACHE_ARCH=x86_64-unknown-linux-musl ;; \
"arm64") SCCACHE_ARCH=aarch64-unknown-linux-musl ;; \
*) echo "Unsupported arch: ${TARGETARCH}"; exit 1 ;; \
esac && \
curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-${SCCACHE_ARCH}.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-${SCCACHE_ARCH}/sccache && \
chmod +x /usr/local/bin/sccache

COPY rust-toolchain.toml rust-toolchain.toml
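`TARGETARCH` is populated automatically by BuildKit from the `--platform` flag, so no extra build-args are needed for the architecture switch. A quick standalone check of the mapping used in the `RUN` step above (a sketch; in a real build the variable is injected per target platform):

```shell
# Exercise the same case statement outside of Docker.
for TARGETARCH in amd64 arm64; do
  case "${TARGETARCH}" in
    "amd64") SCCACHE_ARCH=x86_64-unknown-linux-musl ;;
    "arm64") SCCACHE_ARCH=aarch64-unknown-linux-musl ;;
  esac
  echo "${TARGETARCH} -> ${SCCACHE_ARCH}"
done
```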
@@ -65,6 +71,9 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
elif [ ${CUDA_COMPUTE_CAP} -eq 120 ]; \
then \
nvprune --generate-code code=sm_120 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
elif [ ${CUDA_COMPUTE_CAP} -eq 121 ]; \
then \
nvprune --generate-code code=sm_120 --generate-code code=sm_121 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
else \
echo "cuda compute cap ${CUDA_COMPUTE_CAP} is not supported"; exit 1; \
fi;
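The new `121` branch prunes `libcublas_static.a` down to both `sm_120` and `sm_121` cubins, so a single library covers both Blackwell variants. To check which value to pass as `CUDA_COMPUTE_CAP`, you can query the local GPU (requires a reasonably recent NVIDIA driver):

```shell
# Prints e.g. "12.1" on a DGX Spark GB10 or "12.0" on a GeForce RTX 5090.
nvidia-smi --query-gpu=compute_cap --format=csv,noheader
```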
@@ -103,7 +112,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
unzip \
&& rm -rf /var/lib/apt/lists/*

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
ARG TARGETARCH
RUN case "${TARGETARCH}" in \
"amd64") PROTOC_ARCH=x86_64 ;; \
"arm64") PROTOC_ARCH=aarch_64 ;; \
*) echo "Unsupported arch: ${TARGETARCH}"; exit 1 ;; \
esac && \
PROTOC_ZIP=protoc-21.12-linux-${PROTOC_ARCH}.zip && \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
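The protoc download follows the same `TARGETARCH` pattern; note that the upstream release artifacts use `aarch_64` (with an underscore) rather than `aarch64`. A quick way to confirm the resolved artifact exists (a sketch; run anywhere with `curl` and `unzip` available):

```shell
PROTOC_ZIP=protoc-21.12-linux-aarch_64.zip
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP
unzip -l $PROTOC_ZIP | head
```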
57 changes: 39 additions & 18 deletions README.md
@@ -42,7 +42,7 @@ length of 512 tokens:
- [Local Install](#local-install)
- [Apple Silicon (Homebrew)](#apple-silicon-homebrew)
- [Docker Build](#docker-build)
- [Apple M1/M2 Arm](#apple-m1m2-arm64-architectures)
- [ARM64 / aarch64](#arm64--aarch64)
- [Examples](#examples)

Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence
@@ -336,17 +336,19 @@ Options:

Text Embeddings Inference ships with multiple Docker images that you can use to target a specific backend:

| Architecture | Image |
|----------------------------------------|-------------------------------------------------------------------------|
| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-1.9 |
| Volta | NOT SUPPORTED |
| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-1.9 (experimental) |
| Ampere 8.0 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:1.9 |
| Ampere 8.6 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-1.9 |
| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-1.9 |
| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-1.9 |
| Blackwell 10.0 (B200, GB200, ...) | ghcr.io/huggingface/text-embeddings-inference:100-1.9 (experimental) |
| Blackwell 12.0 (GeForce RTX 50X0, ...) | ghcr.io/huggingface/text-embeddings-inference:120-1.9 (experimental) |
| Architecture | Platform | Image |
|----------------------------------------|----------|-------------------------------------------------------------------------|
| CPU | x86_64 | ghcr.io/huggingface/text-embeddings-inference:cpu-1.9 |
| CPU | aarch64 | ghcr.io/huggingface/text-embeddings-inference:cpu-arm64-1.9 |
> **Review comment (Member):** Should we add (experimental) here too despite already being validated, at least until we run aarch64 for a couple of releases?

| Volta | x86_64 | NOT SUPPORTED |
| Turing (T4, RTX 2000 series, ...) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:turing-1.9 (experimental) |
| Ampere 8.0 (A100, A30) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:1.9 |
| Ampere 8.6 (A10, A40, ...) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:86-1.9 |
| Ada Lovelace (RTX 4000 series, ...) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:89-1.9 |
| Hopper (H100) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:hopper-1.9 |
| Blackwell 10.0 (B200, GB200, ...) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:100-1.9 (experimental) |
| Blackwell 12.0 (GeForce RTX 50X0, ...) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:120-1.9 (experimental) |
| Blackwell 12.1 (DGX Spark GB10, ...) | multi | ghcr.io/huggingface/text-embeddings-inference:121-1.9 (experimental) |

**Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
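A quick way to try the new aarch64 CPU image (a sketch; the model id is illustrative):

```shell
docker run -p 8080:80 \
  ghcr.io/huggingface/text-embeddings-inference:cpu-arm64-1.9 \
  --model-id BAAI/bge-large-en-v1.5
```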
@@ -609,21 +611,40 @@ runtime_compute_cap=100
# Example for Blackwell (GeForce RTX 50X0, RTX PRO 6000, ...)
runtime_compute_cap=120

# Example for Blackwell GB10 (DGX Spark)
runtime_compute_cap=121

docker build . -f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
```

### Apple M1/M2 arm64 architectures
### ARM64 / aarch64

#### DISCLAIMER
#### CPU-only (Apple Silicon, Ampere, Graviton)

As explained here [MPS-Ready, ARM64 Docker Image](https://github.com/pytorch/pytorch/issues/81224), Metal / MPS is not
supported via Docker. As such inference will be CPU bound and most likely pretty slow when using this docker image on an
M1/M2 ARM CPU.
For ARM64 hosts without NVIDIA GPUs, use the CPU Dockerfile. Inference runs on CPU cores
only (no Metal/MPS support via Docker).
> **Review comment (Member) on lines +624 to +625, suggested change:** "For ARM64 hosts without NVIDIA GPUs such as Apple Silicon, use the `Dockerfile` for CPU, where inference will run without any accelerator, as Metal / MPS is not supported via Docker."

```
```shell
docker build . -f Dockerfile-arm64 --platform=linux/arm64
```
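To run the image you just built, tag it during the build (a sketch; the tag and model id are illustrative):

```shell
docker build . -f Dockerfile-arm64 --platform=linux/arm64 -t tei-cpu-arm64
docker run -p 8080:80 tei-cpu-arm64 --model-id BAAI/bge-large-en-v1.5
```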

#### CUDA on ARM64 (DGX Spark, Jetson)

For ARM64 hosts with NVIDIA GPUs, build `Dockerfile-cuda` with the appropriate compute
capability and `--platform linux/arm64`:
> **Review comment (Member) on lines +633 to +634, suggested change:** "For ARM64 hosts with NVIDIA GPUs, use / build the `Dockerfile-cuda` with `--platform linux/arm64`, and also with the `--build-arg CUDA_COMPUTE_CAP` set to whatever your instance compute capability is (only required when building the image)."

```shell
# DGX Spark (GB10, sm_121)
docker build . -f Dockerfile-cuda \
--build-arg CUDA_COMPUTE_CAP=121 \
--platform linux/arm64

# Future ARM64 + Blackwell devices (sm_120)
docker build . -f Dockerfile-cuda \
--build-arg CUDA_COMPUTE_CAP=120 \
--platform linux/arm64
```
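Running the resulting image on the GPU requires the NVIDIA Container Toolkit for `--gpus` (a sketch; the tag and model id are illustrative):

```shell
docker build . -f Dockerfile-cuda \
  --build-arg CUDA_COMPUTE_CAP=121 \
  --platform linux/arm64 -t tei-cuda-arm64

docker run --gpus all -p 8080:80 tei-cuda-arm64 --model-id BAAI/bge-large-en-v1.5
```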

## Examples

- [Set up an Inference Endpoint with TEI](https://huggingface.co/learn/cookbook/automatic_embedding_tei_inference_endpoints)
12 changes: 11 additions & 1 deletion backends/candle/src/compute_cap.rs
@@ -31,7 +31,8 @@ fn compute_cap_matching(runtime_compute_cap: usize, compile_compute_cap: usize)
(89, 89) => true,
(90, 90) => true,
(100, 100) => true,
(120, 120) => true,
(120..=121, 120) => true,
(121, 121) => true,
(_, _) => false,
}
}
@@ -57,6 +58,8 @@ mod tests {
assert!(compute_cap_matching(89, 89));
assert!(compute_cap_matching(90, 90));
assert!(compute_cap_matching(120, 120));
assert!(compute_cap_matching(121, 121));
assert!(compute_cap_matching(121, 120));

assert!(compute_cap_matching(86, 80));
assert!(compute_cap_matching(89, 80));
@@ -96,5 +99,12 @@
assert!(!compute_cap_matching(120, 89));
assert!(!compute_cap_matching(120, 90));
assert!(!compute_cap_matching(120, 100));

assert!(!compute_cap_matching(121, 75));
assert!(!compute_cap_matching(121, 80));
assert!(!compute_cap_matching(121, 86));
assert!(!compute_cap_matching(121, 89));
assert!(!compute_cap_matching(121, 90));
assert!(!compute_cap_matching(121, 100));
}
}
1 change: 1 addition & 0 deletions backends/candle/src/flash_attn.rs
@@ -65,6 +65,7 @@ pub(crate) fn flash_attn_varlen(
|| runtime_compute_cap == 90
|| runtime_compute_cap == 100
|| runtime_compute_cap == 120
|| runtime_compute_cap == 121
{
#[cfg(feature = "flash-attn")]
{
27 changes: 15 additions & 12 deletions docs/source/en/supported_models.md
@@ -68,7 +68,8 @@ Below are some examples of the currently supported models:
## Supported hardware

Text Embeddings Inference can be used on CPU, Turing (T4, RTX 2000 series, ...), Ampere 80 (A100, A30),
Ampere 86 (A10, A40, ...), Ada Lovelace (RTX 4000 series, ...), Hopper (H100), and Blackwell (B200, ...) architectures.
Ampere 86 (A10, A40, ...), Ada Lovelace (RTX 4000 series, ...), Hopper (H100), and Blackwell (B200, RTX 5090, DGX Spark, ...) architectures.
ARM64 (aarch64) is supported for both CPU-only and CUDA (Blackwell 12.1) workloads.

The library does **not** support CUDA compute capabilities < 7.5, which means V100, Titan V, GTX 1000 series, etc. are not supported.

@@ -78,17 +79,19 @@ NVIDIA drivers with CUDA version 12.2 or higher.

Find the appropriate Docker image for your hardware in the following table:

| Architecture | Image |
|----------------------------------------|-------------------------------------------------------------------------|
| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-1.9 |
| Volta | NOT SUPPORTED |
| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-1.9 (experimental) |
| Ampere 8.0 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:1.9 |
| Ampere 8.6 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-1.9 |
| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-1.9 |
| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-1.9 |
| Blackwell 10.0 (B200, GB200, ...) | ghcr.io/huggingface/text-embeddings-inference:100-1.9 (experimental) |
| Blackwell 12.0 (GeForce RTX 50X0, ...) | ghcr.io/huggingface/text-embeddings-inference:120-1.9 (experimental) |
| Architecture | Platform | Image |
|----------------------------------------|----------|-------------------------------------------------------------------------|
| CPU | x86_64 | ghcr.io/huggingface/text-embeddings-inference:cpu-1.9 |
| CPU | aarch64 | ghcr.io/huggingface/text-embeddings-inference:cpu-arm64-1.9 |
| Volta | x86_64 | NOT SUPPORTED |
| Turing (T4, RTX 2000 series, ...) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:turing-1.9 (experimental) |
| Ampere 8.0 (A100, A30) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:1.9 |
| Ampere 8.6 (A10, A40, ...) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:86-1.9 |
| Ada Lovelace (RTX 4000 series, ...) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:89-1.9 |
| Hopper (H100) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:hopper-1.9 |
| Blackwell 10.0 (B200, GB200, ...) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:100-1.9 (experimental) |
| Blackwell 12.0 (GeForce RTX 50X0, ...) | x86_64 | ghcr.io/huggingface/text-embeddings-inference:120-1.9 (experimental) |
| Blackwell 12.1 (DGX Spark GB10, ...) | multi | ghcr.io/huggingface/text-embeddings-inference:121-1.9 (experimental) |

**Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
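One way to select a CUDA image tag from the table programmatically (a sketch; only the two Blackwell entries are mapped here, and the model id is illustrative):

```shell
cap=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader)
case "$cap" in
  "12.0") tag="120-1.9" ;;
  "12.1") tag="121-1.9" ;;
  *) echo "see the table above for compute cap $cap"; exit 1 ;;
esac
docker run --gpus all -p 8080:80 \
  "ghcr.io/huggingface/text-embeddings-inference:$tag" \
  --model-id BAAI/bge-large-en-v1.5
```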