
Commit dcefe12

Release Neuronx TGI 0.0.22 (#74)
* release 0.0.22
* increase test timeout
* update test to use Llama2-7b again
* test 0.0.21
* placeholder change
* update
* update
* trigger tests
1 parent 991cf1d commit dcefe12


3 files changed: 177 additions, 6 deletions

Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
# Fetch and extract the TGI sources (pinned to v2.0.2)
FROM alpine AS tgi
RUN mkdir -p /tgi
ADD https://github.com/huggingface/text-generation-inference/archive/refs/tags/v2.0.2.tar.gz /tgi/sources.tar.gz
RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
# Note that the build image is aligned on the same Linux version as the base image (Debian bookworm / Ubuntu 22.04)
FROM lukemathwalker/cargo-chef:latest-rust-1.75-bookworm AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json

COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/launcher launcher
RUN cargo build --release --workspace --exclude benchmark

# Fetch optimum-neuron sources
FROM alpine/git AS optimum-neuron
RUN git clone --depth 1 --branch v0.0.22 https://github.com/huggingface/optimum-neuron.git /optimum-neuron

# Python base image
FROM ubuntu:22.04 AS base

RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
    python3-pip \
    python3-setuptools \
    python-is-python3 \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean
RUN pip3 --no-cache-dir install --upgrade pip

# Python server build image
FROM base AS pyserver

RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
    make \
    python3-venv \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

RUN install -d /pyserver
WORKDIR /pyserver
COPY --from=optimum-neuron /optimum-neuron/text-generation-inference/server server
COPY --from=tgi /tgi/proto proto
RUN pip3 install -r server/build-requirements.txt
RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server gen-server

# Neuron base image (used for deployment)
FROM base AS neuron

# Install system prerequisites
RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
    gnupg2 \
    wget \
    python3-dev \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -

# Install neuronx packages
RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
    aws-neuronx-dkms=2.16.7.0 \
    aws-neuronx-collectives=2.20.22.0-c101c322e \
    aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 \
    aws-neuronx-tools=2.17.1.0 \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"

RUN pip3 install \
    neuronx-cc==2.13.66.0 \
    torch-neuronx==2.1.2.2.1.0 \
    transformers-neuronx==0.10.0.21 \
    --extra-index-url=https://pip.repos.neuron.amazonaws.com

# Install HuggingFace packages
RUN pip3 install \
    hf_transfer huggingface_hub

# Install optimum-neuron
COPY --from=optimum-neuron /optimum-neuron optimum-neuron
RUN pip3 install ./optimum-neuron

# TGI base env
ENV HUGGINGFACE_HUB_CACHE=/tmp \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# Disable color logs as they are not supported by CloudWatch
ENV LOGURU_COLORIZE=NO

# Install router
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
# Install python server
COPY --from=pyserver /pyserver/build/dist dist
RUN pip install dist/text_generation_server*.tar.gz

# AWS SageMaker compatible image
FROM neuron AS sagemaker

COPY --from=optimum-neuron /optimum-neuron/text-generation-inference/sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

RUN apt-get update && apt-get install -y --no-install-recommends curl unzip \
 && rm -rf /var/lib/apt/lists/*
RUN HOME_DIR=/root && \
    pip install requests && \
    curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
    unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
    cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
    chmod +x /usr/local/bin/testOSSCompliance && \
    chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
    ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
    rm -rf ${HOME_DIR}/oss_compliance*

RUN echo "N.B.: Although this image is released under the Apache-2.0 License, the Dockerfile used to build the image \
has an indirect documentation dependency on third party <docutils/tools/editors/emacs/rst.el> project. The \
<docutils/tools/editors/emacs/rst.el> project's licensing includes the <GPL v3> license. \
\n\n\
N.B.: Although this image is released under the Apache-2.0 License, the Dockerfile used to build the image uses the \
third party <Text Generation Inference (TGI)> project. The <Text Generation Inference (TGI)> project's licensing \
includes the <HFOIL --> https://github.com/huggingface/text-generation-inference/blob/main/LICENSE> \
license." > /root/THIRD_PARTY_LICENSES

LABEL dlc_major_version="1"
LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true"
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"
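
The Dockerfile is multi-stage: the neuron stage produces the deployable TGI-on-Neuron image, and sagemaker layers the SageMaker entrypoint and compliance artifacts on top of it. A rough sketch of a local build follows; the file name and image tags are assumptions, not part of this commit:

# Build the SageMaker-compatible final stage (file name and tag are assumed)
docker build -f Dockerfile.neuronx --target sagemaker -t neuronx-tgi:0.0.22 .

# Or stop at the plain Neuron stage, which installs the launcher and router
# binaries but does not set the SageMaker entrypoint
docker build -f Dockerfile.neuronx --target neuron -t neuronx-tgi:0.0.22-base .

Because both targets share every earlier stage, the expensive Rust build in the builder stage runs once and is reused from the Docker layer cache.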

releases.json

Lines changed: 11 additions & 4 deletions
@@ -43,17 +43,24 @@
       "os_version": "ubuntu22.04",
       "python_version": "py310",
       "pytorch_version": "1.13.1"
+    },
+    {
+      "device": "inf2",
+      "min_version": "0.0.22",
+      "max_version": "0.0.22",
+      "os_version": "ubuntu22.04",
+      "python_version": "py310",
+      "pytorch_version": "2.1.2"
     }
   ],
   "ignore_vulnerabilities": [],
   "releases": [
     {
-      "device": "gpu",
-      "version": "2.0.2",
+      "device": "inf2",
+      "version": "0.0.22",
       "os_version": "ubuntu22.04",
       "python_version": "py310",
-      "cuda_version": "cu121",
-      "pytorch_version": "2.3.0"
+      "pytorch_version": "2.1.2"
     }
   ]
 }
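
As a quick sanity check that the new entry is well-formed, the release list can be queried with jq (a sketch; assumes jq is installed and the command is run from the directory containing releases.json):

# Should print the single inf2 release added above (version 0.0.22, PyTorch 2.1.2)
jq '.releases[] | select(.device == "inf2")' releases.json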

tests/huggingface/sagemaker_dlc_test.py

Lines changed: 2 additions & 2 deletions
@@ -79,7 +79,7 @@ def run_test(args):
     pytest.param("google/flan-t5-xxl", None, "ml.g5.12xlarge", marks=pytest.mark.gpu),
     pytest.param("HuggingFaceTB/cosmo-1b", None, "ml.inf2.8xlarge", marks=pytest.mark.inf2),
 ])
-def test(model_id: str, model_revision: str, instance_type: str, timeout: str = "1800"):
+def test(model_id: str, model_revision: str, instance_type: str, timeout: str = "2400"):
     image_uri = os.getenv("IMAGE_URI")
     test_role_arn = os.getenv("TEST_ROLE_ARN")
     assert image_uri, f"Please set IMAGE_URI environment variable."
@@ -92,7 +92,7 @@ def test(model_id: str, model_revision: str, instance_type: str, timeout: str =
         role=test_role_arn,
         timeout=timeout)

-    logging.info(f"Running sanity test with the following args: {args}.")
+    logging.info(f"Running sanity test with the following arguments: {args}.")
     run_test(args)
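The test module reads IMAGE_URI and TEST_ROLE_ARN from the environment and tags each parametrized case with a gpu or inf2 marker, so a run against the new Neuron image might look like the following (a sketch; the placeholder values are assumptions):

# Run only the inf2-marked case; the increased default timeout (2400) applies
# unless overridden
IMAGE_URI=<account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag> \
TEST_ROLE_ARN=arn:aws:iam::<account>:role/<sagemaker-execution-role> \
pytest -m inf2 tests/huggingface/sagemaker_dlc_test.py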