diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 765918d6d..aeef1e5ed 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -126,7 +126,7 @@ jobs:
           context: .
           file: ${{ matrix.dockerfile }}
           push: ${{ github.event_name != 'pull_request' }}
-          platforms: "linux/amd64"
+          platforms: ${{ matrix.platforms || 'linux/amd64' }}
           build-args: |
             SCCACHE_GHA_ENABLED=${{ matrix.sccache }}
             CUDA_COMPUTE_CAP=${{ matrix.cudaComputeCap }}
@@ -168,7 +168,7 @@ jobs:
           target: grpc
           file: ${{ matrix.dockerfile }}
           push: ${{ github.event_name != 'pull_request' }}
-          platforms: "linux/amd64"
+          platforms: ${{ matrix.platforms || 'linux/amd64' }}
           build-args: |
             SCCACHE_GHA_ENABLED=${{ matrix.sccache }}
             CUDA_COMPUTE_CAP=${{ matrix.cudaComputeCap }}
diff --git a/.github/workflows/matrix.json b/.github/workflows/matrix.json
index 9449b53be..b07515964 100644
--- a/.github/workflows/matrix.json
+++ b/.github/workflows/matrix.json
@@ -63,6 +63,16 @@
     "grpc": true,
     "dockerfile": "Dockerfile-cuda"
   },
+  {
+    "name": "blackwell-121",
+    "imageNamePrefix": "121-",
+    "runOn": "always",
+    "sccache": true,
+    "cudaComputeCap": 121,
+    "platforms": "linux/amd64,linux/arm64",
+    "grpc": true,
+    "dockerfile": "Dockerfile-cuda"
+  },
   {
     "name": "all",
     "imageNamePrefix": "cuda-",
@@ -79,6 +89,15 @@
     "grpc": true,
     "dockerfile": "Dockerfile"
   },
+  {
+    "name": "cpu-arm64",
+    "imageNamePrefix": "cpu-arm64-",
+    "runOn": "always",
+    "sccache": true,
+    "platforms": "linux/arm64",
+    "grpc": true,
+    "dockerfile": "Dockerfile-arm64"
+  },
   {
     "name": "cpu-ipex",
     "imageNamePrefix": "cpu-ipex-",
diff --git a/Dockerfile-cuda b/Dockerfile-cuda
index 489f0f444..9c08835c7 100644
--- a/Dockerfile-cuda
+++ b/Dockerfile-cuda
@@ -12,8 +12,14 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     pkg-config \
     && rm -rf /var/lib/apt/lists/*
 
-# Donwload and configure sccache
-RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
+# Download and configure sccache (multi-arch)
+ARG TARGETARCH
+RUN case "${TARGETARCH}" in \
+    "amd64") SCCACHE_ARCH=x86_64-unknown-linux-musl ;; \
+    "arm64") SCCACHE_ARCH=aarch64-unknown-linux-musl ;; \
+    *) echo "Unsupported arch: ${TARGETARCH}"; exit 1 ;; \
+    esac && \
+    curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-${SCCACHE_ARCH}.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-${SCCACHE_ARCH}/sccache && \
     chmod +x /usr/local/bin/sccache
 
 COPY rust-toolchain.toml rust-toolchain.toml
@@ -65,6 +71,9 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     elif [ ${CUDA_COMPUTE_CAP} -eq 120 ]; \
     then \
         nvprune --generate-code code=sm_120 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
+    elif [ ${CUDA_COMPUTE_CAP} -eq 121 ]; \
+    then \
+        nvprune --generate-code code=sm_120 --generate-code code=sm_121 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
     else \
         echo "cuda compute cap ${CUDA_COMPUTE_CAP} is not supported"; exit 1; \
     fi;
@@ -103,7 +112,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     unzip \
     && rm -rf /var/lib/apt/lists/*
 
-RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+ARG TARGETARCH
+RUN case "${TARGETARCH}" in \
+    "amd64") PROTOC_ARCH=x86_64 ;; \
+    "arm64") PROTOC_ARCH=aarch_64 ;; \
+    *) echo "Unsupported arch: ${TARGETARCH}"; exit 1 ;; \
+    esac && \
+    PROTOC_ZIP=protoc-21.12-linux-${PROTOC_ARCH}.zip && \
     curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
     unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
     unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
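The `TARGETARCH` switches above can be exercised without going through the CI matrix. A minimal local sketch, assuming Docker with BuildKit and an arm64-capable builder (the `tei-cuda-121:local` tag is illustrative and not part of this PR; note that multi-platform builds must be published with `--push`, since `--load` only accepts a single platform):

```shell
# Cross-build the CUDA image for arm64 only and load it into the local daemon.
docker buildx build . \
  -f Dockerfile-cuda \
  --platform linux/arm64 \
  --build-arg CUDA_COMPUTE_CAP=121 \
  -t tei-cuda-121:local \
  --load
```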
diff --git a/README.md b/README.md
index 046fae484..d569dfcc4 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ length of 512 tokens:
   - [Local Install](#local-install)
   - [Apple Silicon (Homebrew)](#apple-silicon-homebrew)
   - [Docker Build](#docker-build)
-    - [Apple M1/M2 Arm](#apple-m1m2-arm64-architectures)
+    - [ARM64 / aarch64](#arm64--aarch64)
   - [Examples](#examples)
 
 Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence
@@ -336,17 +336,19 @@
 
 Text Embeddings Inference ships with multiple Docker images that you can use to target a specific backend:
 
-| Architecture                            | Image                                                                    |
-|-----------------------------------------|--------------------------------------------------------------------------|
-| CPU                                     | ghcr.io/huggingface/text-embeddings-inference:cpu-1.9                    |
-| Volta                                   | NOT SUPPORTED                                                            |
-| Turing (T4, RTX 2000 series, ...)       | ghcr.io/huggingface/text-embeddings-inference:turing-1.9 (experimental)  |
-| Ampere 8.0 (A100, A30)                  | ghcr.io/huggingface/text-embeddings-inference:1.9                        |
-| Ampere 8.6 (A10, A40, ...)              | ghcr.io/huggingface/text-embeddings-inference:86-1.9                     |
-| Ada Lovelace (RTX 4000 series, ...)     | ghcr.io/huggingface/text-embeddings-inference:89-1.9                     |
-| Hopper (H100)                           | ghcr.io/huggingface/text-embeddings-inference:hopper-1.9                 |
-| Blackwell 10.0 (B200, GB200, ...)       | ghcr.io/huggingface/text-embeddings-inference:100-1.9 (experimental)     |
-| Blackwell 12.0 (GeForce RTX 50X0, ...)  | ghcr.io/huggingface/text-embeddings-inference:120-1.9 (experimental)     |
+| Architecture                            | Platform | Image                                                                    |
+|-----------------------------------------|----------|--------------------------------------------------------------------------|
+| CPU                                     | x86_64   | ghcr.io/huggingface/text-embeddings-inference:cpu-1.9                    |
+| CPU                                     | aarch64  | ghcr.io/huggingface/text-embeddings-inference:cpu-arm64-1.9              |
+| Volta                                   | x86_64   | NOT SUPPORTED                                                            |
+| Turing (T4, RTX 2000 series, ...)       | x86_64   | ghcr.io/huggingface/text-embeddings-inference:turing-1.9 (experimental)  |
+| Ampere 8.0 (A100, A30)                  | x86_64   | ghcr.io/huggingface/text-embeddings-inference:1.9                        |
+| Ampere 8.6 (A10, A40, ...)              | x86_64   | ghcr.io/huggingface/text-embeddings-inference:86-1.9                     |
+| Ada Lovelace (RTX 4000 series, ...)     | x86_64   | ghcr.io/huggingface/text-embeddings-inference:89-1.9                     |
+| Hopper (H100)                           | x86_64   | ghcr.io/huggingface/text-embeddings-inference:hopper-1.9                 |
+| Blackwell 10.0 (B200, GB200, ...)       | x86_64   | ghcr.io/huggingface/text-embeddings-inference:100-1.9 (experimental)     |
+| Blackwell 12.0 (GeForce RTX 50X0, ...)  | x86_64   | ghcr.io/huggingface/text-embeddings-inference:120-1.9 (experimental)     |
+| Blackwell 12.1 (DGX Spark GB10, ...)    | multi    | ghcr.io/huggingface/text-embeddings-inference:121-1.9 (experimental)     |
 
 **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues. You
 can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
@@ -609,21 +611,40 @@ runtime_compute_cap=100
 # Example for Blackwell (GeForce RTX 50X0, RTX PRO 6000, ...)
 runtime_compute_cap=120
 
+# Example for Blackwell GB10 (DGX Spark)
+runtime_compute_cap=121
+
 docker build . -f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
 ```
 
-### Apple M1/M2 arm64 architectures
+### ARM64 / aarch64
 
-#### DISCLAIMER
+#### CPU-only (Apple Silicon, Ampere, Graviton)
 
-As explained here [MPS-Ready, ARM64 Docker Image](https://github.com/pytorch/pytorch/issues/81224), Metal / MPS is not
-supported via Docker. As such inference will be CPU bound and most likely pretty slow when using this docker image on an
-M1/M2 ARM CPU.
+For ARM64 hosts without NVIDIA GPUs, use the CPU Dockerfile. Inference runs on CPU cores
+only (no Metal/MPS support via Docker).
 
-```
+```shell
 docker build . -f Dockerfile-arm64 --platform=linux/arm64
 ```
 
+#### CUDA on ARM64 (DGX Spark, Jetson)
+
+For ARM64 hosts with NVIDIA GPUs, build `Dockerfile-cuda` with the appropriate compute
+capability and `--platform linux/arm64`:
+
+```shell
+# DGX Spark (GB10, sm_121)
+docker build . -f Dockerfile-cuda \
+  --build-arg CUDA_COMPUTE_CAP=121 \
+  --platform linux/arm64
+
+# Future ARM64 + Blackwell devices (sm_120)
+docker build . -f Dockerfile-cuda \
+  --build-arg CUDA_COMPUTE_CAP=120 \
+  --platform linux/arm64
+```
+
 ## Examples
 
 - [Set up an Inference Endpoint with TEI](https://huggingface.co/learn/cookbook/automatic_embedding_tei_inference_endpoints)
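Following the `docker run` pattern the README documents elsewhere, a smoke test of the new multi-arch CUDA tag might look like this (the model id is only an example):

```shell
# Serve an embedding model from the Blackwell 12.1 image on an ARM64 host...
docker run --gpus all -p 8080:80 -v $PWD/data:/data \
  ghcr.io/huggingface/text-embeddings-inference:121-1.9 \
  --model-id BAAI/bge-large-en-v1.5

# ...and query it.
curl 127.0.0.1:8080/embed \
  -X POST -d '{"inputs":"What is Deep Learning?"}' \
  -H 'Content-Type: application/json'
```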
diff --git a/backends/candle/src/compute_cap.rs b/backends/candle/src/compute_cap.rs
index 782806279..57684c76e 100644
--- a/backends/candle/src/compute_cap.rs
+++ b/backends/candle/src/compute_cap.rs
@@ -31,7 +31,8 @@ fn compute_cap_matching(runtime_compute_cap: usize, compile_compute_cap: usize)
         (89, 89) => true,
         (90, 90) => true,
         (100, 100) => true,
-        (120, 120) => true,
+        (120..=121, 120) => true,
+        (121, 121) => true,
         (_, _) => false,
     }
 }
@@ -57,6 +58,8 @@ mod tests {
         assert!(compute_cap_matching(89, 89));
         assert!(compute_cap_matching(90, 90));
         assert!(compute_cap_matching(120, 120));
+        assert!(compute_cap_matching(121, 121));
+        assert!(compute_cap_matching(121, 120));
 
         assert!(compute_cap_matching(86, 80));
         assert!(compute_cap_matching(89, 80));
@@ -96,5 +99,12 @@
         assert!(!compute_cap_matching(120, 89));
         assert!(!compute_cap_matching(120, 90));
         assert!(!compute_cap_matching(120, 100));
+
+        assert!(!compute_cap_matching(121, 75));
+        assert!(!compute_cap_matching(121, 80));
+        assert!(!compute_cap_matching(121, 86));
+        assert!(!compute_cap_matching(121, 89));
+        assert!(!compute_cap_matching(121, 90));
+        assert!(!compute_cap_matching(121, 100));
     }
 }
diff --git a/backends/candle/src/flash_attn.rs b/backends/candle/src/flash_attn.rs
index 5793554bd..0d1046ea3 100644
--- a/backends/candle/src/flash_attn.rs
+++ b/backends/candle/src/flash_attn.rs
@@ -65,6 +65,7 @@ pub(crate) fn flash_attn_varlen(
         || runtime_compute_cap == 90
         || runtime_compute_cap == 100
         || runtime_compute_cap == 120
+        || runtime_compute_cap == 121
     {
         #[cfg(feature = "flash-attn")]
         {
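The relaxed `(120..=121, 120)` arm lets a build compiled for sm_120 run on sm_121 hardware. To see which runtime capability a host actually reports, `nvidia-smi` can query it directly (a sketch; assumes a reasonably recent NVIDIA driver that supports the `compute_cap` query field):

```shell
# Prints e.g. "12.1" on a GB10; TEI compares this (as the integer 121)
# against the compute cap the image was compiled for.
nvidia-smi --query-gpu=compute_cap --format=csv,noheader
```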
diff --git a/docs/source/en/supported_models.md b/docs/source/en/supported_models.md
index fb3b97143..3df7717fd 100644
--- a/docs/source/en/supported_models.md
+++ b/docs/source/en/supported_models.md
@@ -68,7 +68,8 @@ Below are some examples of the currently supported models:
 ## Supported hardware
 
 Text Embeddings Inference supports can be used on CPU, Turing (T4, RTX 2000 series, ...), Ampere 80 (A100, A30),
-Ampere 86 (A10, A40, ...), Ada Lovelace (RTX 4000 series, ...), Hopper (H100), and Blackwell (B200, ...) architectures.
+Ampere 86 (A10, A40, ...), Ada Lovelace (RTX 4000 series, ...), Hopper (H100), and Blackwell (B200, RTX 5090, DGX Spark, ...) architectures.
+ARM64 (aarch64) is supported for both CPU-only and CUDA (Blackwell 12.1) workloads.
 
 The library does **not** support CUDA compute capabilities < 7.5, which means V100, Titan V, GTX 1000 series, etc. are
 not supported.
@@ -78,17 +79,19 @@ NVIDIA drivers with CUDA version 12.2 or higher.
 
 Find the appropriate Docker image for your hardware in the following table:
 
-| Architecture                            | Image                                                                    |
-|-----------------------------------------|--------------------------------------------------------------------------|
-| CPU                                     | ghcr.io/huggingface/text-embeddings-inference:cpu-1.9                    |
-| Volta                                   | NOT SUPPORTED                                                            |
-| Turing (T4, RTX 2000 series, ...)       | ghcr.io/huggingface/text-embeddings-inference:turing-1.9 (experimental)  |
-| Ampere 8.0 (A100, A30)                  | ghcr.io/huggingface/text-embeddings-inference:1.9                        |
-| Ampere 8.6 (A10, A40, ...)              | ghcr.io/huggingface/text-embeddings-inference:86-1.9                     |
-| Ada Lovelace (RTX 4000 series, ...)     | ghcr.io/huggingface/text-embeddings-inference:89-1.9                     |
-| Hopper (H100)                           | ghcr.io/huggingface/text-embeddings-inference:hopper-1.9                 |
-| Blackwell 10.0 (B200, GB200, ...)       | ghcr.io/huggingface/text-embeddings-inference:100-1.9 (experimental)     |
-| Blackwell 12.0 (GeForce RTX 50X0, ...)  | ghcr.io/huggingface/text-embeddings-inference:120-1.9 (experimental)     |
+| Architecture                            | Platform | Image                                                                    |
+|-----------------------------------------|----------|--------------------------------------------------------------------------|
+| CPU                                     | x86_64   | ghcr.io/huggingface/text-embeddings-inference:cpu-1.9                    |
+| CPU                                     | aarch64  | ghcr.io/huggingface/text-embeddings-inference:cpu-arm64-1.9              |
+| Volta                                   | x86_64   | NOT SUPPORTED                                                            |
+| Turing (T4, RTX 2000 series, ...)       | x86_64   | ghcr.io/huggingface/text-embeddings-inference:turing-1.9 (experimental)  |
+| Ampere 8.0 (A100, A30)                  | x86_64   | ghcr.io/huggingface/text-embeddings-inference:1.9                        |
+| Ampere 8.6 (A10, A40, ...)              | x86_64   | ghcr.io/huggingface/text-embeddings-inference:86-1.9                     |
+| Ada Lovelace (RTX 4000 series, ...)     | x86_64   | ghcr.io/huggingface/text-embeddings-inference:89-1.9                     |
+| Hopper (H100)                           | x86_64   | ghcr.io/huggingface/text-embeddings-inference:hopper-1.9                 |
+| Blackwell 10.0 (B200, GB200, ...)       | x86_64   | ghcr.io/huggingface/text-embeddings-inference:100-1.9 (experimental)     |
+| Blackwell 12.0 (GeForce RTX 50X0, ...)  | x86_64   | ghcr.io/huggingface/text-embeddings-inference:120-1.9 (experimental)     |
+| Blackwell 12.1 (DGX Spark GB10, ...)    | multi    | ghcr.io/huggingface/text-embeddings-inference:121-1.9 (experimental)     |
 
 **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues. You
 can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
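Once the tags above are published, the multi-arch manifest can be verified without pulling the image; for the `121-` tag one would expect both `linux/amd64` and `linux/arm64` entries:

```shell
# Inspect the per-platform entries in the published multi-arch manifest.
docker buildx imagetools inspect \
  ghcr.io/huggingface/text-embeddings-inference:121-1.9
```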