diff --git a/.github/workflows/image-pr.yml b/.github/workflows/image-pr.yml index 527a8479ee39..2e9a0afee511 100644 --- a/.github/workflows/image-pr.yml +++ b/.github/workflows/image-pr.yml @@ -59,6 +59,14 @@ jobs: image-type: 'extras' base-image: "rocm/dev-ubuntu-22.04:6.0-complete" runs-on: 'arc-runner-set' + - build-type: 'sycl_f16' + platforms: 'linux/amd64' + tag-latest: 'false' + base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04" + tag-suffix: '-sycl-f16-ffmpeg' + ffmpeg: 'true' + image-type: 'extras' + runs-on: 'arc-runner-set' core-image-build: uses: ./.github/workflows/image_build.yml with: @@ -105,4 +113,4 @@ jobs: ffmpeg: 'true' image-type: 'core' runs-on: 'ubuntu-latest' - base-image: "ubuntu:22.04" + base-image: "ubuntu:22.04" \ No newline at end of file diff --git a/.github/workflows/image.yml b/.github/workflows/image.yml index a9620baa5643..2a7fac27a377 100644 --- a/.github/workflows/image.yml +++ b/.github/workflows/image.yml @@ -120,6 +120,22 @@ jobs: image-type: 'extras' base-image: "rocm/dev-ubuntu-22.04:6.0-complete" runs-on: 'arc-runner-set' + - build-type: 'sycl_f16' + platforms: 'linux/amd64' + tag-latest: 'false' + base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04" + tag-suffix: '-sycl-f16-ffmpeg' + ffmpeg: 'true' + image-type: 'extras' + runs-on: 'arc-runner-set' + - build-type: 'sycl_f32' + platforms: 'linux/amd64' + tag-latest: 'false' + base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04" + tag-suffix: '-sycl-f32-ffmpeg' + ffmpeg: 'true' + image-type: 'extras' + runs-on: 'arc-runner-set' # Core images - build-type: 'sycl_f16' platforms: 'linux/amd64' diff --git a/Dockerfile b/Dockerfile index a04a866ec7d1..fd3659629395 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,8 @@ ARG BASE_IMAGE=ubuntu:22.04 # extras or core FROM ${BASE_IMAGE} as requirements-core +USER root + ARG GO_VERSION=1.21.7 ARG BUILD_TYPE ARG CUDA_MAJOR_VERSION=11 @@ -21,7 +23,7 @@ RUN apt-get update && \ apt-get install -y 
ca-certificates curl patch pip cmake git && apt-get clean # Install Go -RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -v -C /usr/local -xz +RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz ENV PATH $PATH:/usr/local/go/bin COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/ @@ -79,6 +81,10 @@ RUN pip install --upgrade pip RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y RUN apt-get install -y espeak-ng espeak && apt-get clean +RUN if [ ! -e /usr/bin/python ]; then \ + ln -s /usr/bin/python3 /usr/bin/python \ + ; fi + ################################### ################################### @@ -166,43 +172,43 @@ COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/ ## Duplicated from Makefile to avoid having a big layer that's hard to push RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/autogptq \ + make -C backend/python/autogptq \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/bark \ + make -C backend/python/bark \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers \ + make -C backend/python/diffusers \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/vllm \ + make -C backend/python/vllm \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/mamba \ + make -C backend/python/mamba \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers \ + make -C backend/python/sentencetransformers \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/transformers \ + make -C backend/python/transformers \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - 
PATH=$PATH:/opt/conda/bin make -C backend/python/vall-e-x \ + make -C backend/python/vall-e-x \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \ + make -C backend/python/exllama \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/exllama2 \ + make -C backend/python/exllama2 \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/petals \ + make -C backend/python/petals \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/transformers-musicgen \ + make -C backend/python/transformers-musicgen \ ; fi RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \ - PATH=$PATH:/opt/conda/bin make -C backend/python/coqui \ + make -C backend/python/coqui \ ; fi # Make sure the models directory exists diff --git a/Makefile b/Makefile index b24ed7972151..324aedfd8d2a 100644 --- a/Makefile +++ b/Makefile @@ -557,3 +557,10 @@ docker-image-intel: --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \ --build-arg GO_TAGS="none" \ --build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) . + +docker-image-intel-xpu: + docker build \ + --build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \ + --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \ + --build-arg GO_TAGS="none" \ + --build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) . \ No newline at end of file diff --git a/backend/python/common-env/transformers/Makefile b/backend/python/common-env/transformers/Makefile index 1cd71ab177d3..797af0832ef2 100644 --- a/backend/python/common-env/transformers/Makefile +++ b/backend/python/common-env/transformers/Makefile @@ -8,6 +8,13 @@ ifeq ($(BUILD_TYPE), hipblas) CONDA_ENV_PATH = "transformers-rocm.yml" endif +# Intel GPU are supposed to have dependencies installed in the main python +# environment, so we skip conda installation for SYCL builds. 
+# https://github.com/intel/intel-extension-for-pytorch/issues/538 +ifneq (,$(findstring sycl,$(BUILD_TYPE))) +export SKIP_CONDA=1 +endif + .PHONY: transformers transformers: @echo "Installing $(CONDA_ENV_PATH)..." diff --git a/backend/python/common-env/transformers/install.sh b/backend/python/common-env/transformers/install.sh index 42965bdbc68e..e268fcc88370 100644 --- a/backend/python/common-env/transformers/install.sh +++ b/backend/python/common-env/transformers/install.sh @@ -1,24 +1,38 @@ #!/bin/bash set -ex +SKIP_CONDA=${SKIP_CONDA:-0} + # Check if environment exist conda_env_exists(){ ! conda list --name "${@}" >/dev/null 2>/dev/null } -if conda_env_exists "transformers" ; then - echo "Creating virtual environment..." - conda env create --name transformers --file $1 - echo "Virtual environment created." -else - echo "Virtual environment already exists." +if [ $SKIP_CONDA -eq 1 ]; then + echo "Skipping conda environment installation" +else + export PATH=$PATH:/opt/conda/bin + if conda_env_exists "transformers" ; then + echo "Creating virtual environment..." + conda env create --name transformers --file $1 + echo "Virtual environment created." + else + echo "Virtual environment already exists." 
+ fi fi -if [ "$PIP_CACHE_PURGE" = true ] ; then - export PATH=$PATH:/opt/conda/bin +if [ -d "/opt/intel" ]; then + # Intel GPU: If the directory exists, we assume we are using the intel image + # (no conda env) + # https://github.com/intel/intel-extension-for-pytorch/issues/538 + pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed +fi - # Activate conda environment - source activate transformers +if [ "$PIP_CACHE_PURGE" = true ] ; then + if [ $SKIP_CONDA -eq 0 ]; then + # Activate conda environment + source activate transformers + fi pip cache purge fi \ No newline at end of file diff --git a/backend/python/diffusers/Makefile b/backend/python/diffusers/Makefile index 70a62b60daa9..40e1d1a7e888 100644 --- a/backend/python/diffusers/Makefile +++ b/backend/python/diffusers/Makefile @@ -4,6 +4,13 @@ ifeq ($(BUILD_TYPE), hipblas) export CONDA_ENV_PATH = "diffusers-rocm.yml" endif +# Intel GPU are supposed to have dependencies installed in the main python +# environment, so we skip conda installation for SYCL builds. +# https://github.com/intel/intel-extension-for-pytorch/issues/538 +ifneq (,$(findstring sycl,$(BUILD_TYPE))) +export SKIP_CONDA=1 +endif + .PHONY: diffusers diffusers: @echo "Installing $(CONDA_ENV_PATH)..." 
diff --git a/backend/python/diffusers/backend_diffusers.py b/backend/python/diffusers/backend_diffusers.py index 6780cae626a6..ec2dea6050e0 100755 --- a/backend/python/diffusers/backend_diffusers.py +++ b/backend/python/diffusers/backend_diffusers.py @@ -21,14 +21,15 @@ from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline from diffusers.pipelines.stable_diffusion import safety_checker from diffusers.utils import load_image,export_to_video -from compel import Compel +from compel import Compel, ReturnedEmbeddingsType from transformers import CLIPTextModel from safetensors.torch import load_file _ONE_DAY_IN_SECONDS = 60 * 60 * 24 -COMPEL=os.environ.get("COMPEL", "1") == "1" +COMPEL=os.environ.get("COMPEL", "0") == "1" +XPU=os.environ.get("XPU", "0") == "1" CLIPSKIP=os.environ.get("CLIPSKIP", "1") == "1" SAFETENSORS=os.environ.get("SAFETENSORS", "1") == "1" CHUNK_SIZE=os.environ.get("CHUNK_SIZE", "8") @@ -36,6 +37,10 @@ DISABLE_CPU_OFFLOAD=os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1" FRAMES=os.environ.get("FRAMES", "64") +if XPU: + import intel_extension_for_pytorch as ipex + print(ipex.xpu.get_device_name(0)) + # If MAX_WORKERS are specified in the environment use it, otherwise default to 1 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) @@ -231,8 +236,13 @@ def LoadModel(self, request, context): if request.SchedulerType != "": self.pipe.scheduler = get_scheduler(request.SchedulerType, self.pipe.scheduler.config) - if not self.img2vid: - self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder) + if COMPEL: + self.compel = Compel( + tokenizer=[self.pipe.tokenizer, self.pipe.tokenizer_2 ], + text_encoder=[self.pipe.text_encoder, self.pipe.text_encoder_2], + returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, + requires_pooled=[False, True] + ) if request.ControlNet: @@ -247,6 +257,8 @@ def LoadModel(self, 
request, context): self.pipe.to('cuda') if self.controlnet: self.controlnet.to('cuda') + if XPU: + self.pipe = self.pipe.to("xpu") # Assume directory from request.ModelFile. # Only if request.LoraAdapter it's not an absolute path if request.LoraAdapter and request.ModelFile != "" and not os.path.isabs(request.LoraAdapter) and request.LoraAdapter: @@ -386,8 +398,9 @@ def GenerateImage(self, request, context): image = {} if COMPEL: - conditioning = self.compel.build_conditioning_tensor(prompt) - kwargs["prompt_embeds"]= conditioning + conditioning, pooled = self.compel.build_conditioning_tensor(prompt) + kwargs["prompt_embeds"] = conditioning + kwargs["pooled_prompt_embeds"] = pooled # pass the kwargs dictionary to the self.pipe method image = self.pipe( guidance_scale=self.cfg_scale, diff --git a/backend/python/diffusers/install.sh b/backend/python/diffusers/install.sh index 0429826e3f4d..d83ec0be0b3b 100755 --- a/backend/python/diffusers/install.sh +++ b/backend/python/diffusers/install.sh @@ -1,24 +1,50 @@ #!/bin/bash set -ex +SKIP_CONDA=${SKIP_CONDA:-0} + # Check if environment exist conda_env_exists(){ ! conda list --name "${@}" >/dev/null 2>/dev/null } -if conda_env_exists "diffusers" ; then - echo "Creating virtual environment..." - conda env create --name diffusers --file $1 - echo "Virtual environment created." -else - echo "Virtual environment already exists." +if [ $SKIP_CONDA -eq 1 ]; then + echo "Skipping conda environment installation" +else + export PATH=$PATH:/opt/conda/bin + if conda_env_exists "diffusers" ; then + echo "Creating virtual environment..." + conda env create --name diffusers --file $1 + echo "Virtual environment created." + else + echo "Virtual environment already exists." 
+ fi fi -if [ "$PIP_CACHE_PURGE" = true ] ; then - export PATH=$PATH:/opt/conda/bin +if [ -d "/opt/intel" ]; then + # Intel GPU: If the directory exists, we assume we are using the Intel image + # https://github.com/intel/intel-extension-for-pytorch/issues/538 + pip install torch==2.1.0a0 \ + torchvision==0.16.0a0 \ + torchaudio==2.1.0a0 \ + intel-extension-for-pytorch==2.1.10+xpu \ + --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + + pip install google-api-python-client \ + grpcio \ + grpcio-tools \ + diffusers==0.24.0 \ + "transformers>=4.25.1" \ + accelerate \ + compel==2.0.2 \ + Pillow +fi - # Activate conda environment - source activate diffusers +if [ "$PIP_CACHE_PURGE" = true ] ; then + if [ $SKIP_CONDA -ne 1 ]; then + # Activate conda environment + source activate diffusers + fi pip cache purge fi \ No newline at end of file diff --git a/backend/python/diffusers/run.sh b/backend/python/diffusers/run.sh index 8e3e1bbfbfdd..69b25d507a62 100755 --- a/backend/python/diffusers/run.sh +++ b/backend/python/diffusers/run.sh @@ -3,10 +3,15 @@ ## ## A bash script wrapper that runs the diffusers server with conda -export PATH=$PATH:/opt/conda/bin - -# Activate conda environment -source activate diffusers +if [ -d "/opt/intel" ]; then + # Assumes we are using the Intel oneAPI container image + # https://github.com/intel/intel-extension-for-pytorch/issues/538 + export XPU=1 +else + export PATH=$PATH:/opt/conda/bin + # Activate conda environment + source activate diffusers +fi # get the directory where the bash script is located DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" diff --git a/backend/python/exllama/install.sh b/backend/python/exllama/install.sh index 702bb1fbefb4..320e7f4dfac7 100755 --- a/backend/python/exllama/install.sh +++ b/backend/python/exllama/install.sh @@ -3,6 +3,11 @@ set -ex export PATH=$PATH:/opt/conda/bin +if [ "$BUILD_TYPE" != "cublas" ]; then + echo "[exllama] Attention!!! 
Nvidia GPU is required - skipping installation" + exit 0 +fi + # Check if environment exist conda_env_exists(){ ! conda list --name "${@}" >/dev/null 2>/dev/null diff --git a/backend/python/exllama2/install.sh b/backend/python/exllama2/install.sh index a6df3d37630b..858685b07eec 100755 --- a/backend/python/exllama2/install.sh +++ b/backend/python/exllama2/install.sh @@ -2,10 +2,14 @@ set -e ## ## A bash script installs the required dependencies of VALL-E-X and prepares the environment -export PATH=$PATH:/opt/conda/bin export SHA=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f -# Activate conda environment +if [ "$BUILD_TYPE" != "cublas" ]; then + echo "[exllamav2] Attention!!! Nvidia GPU is required - skipping installation" + exit 0 +fi + +export PATH=$PATH:/opt/conda/bin source activate transformers echo $CONDA_PREFIX diff --git a/backend/python/mamba/install.sh b/backend/python/mamba/install.sh index e56b83c2d31a..4ef26ece1c30 100755 --- a/backend/python/mamba/install.sh +++ b/backend/python/mamba/install.sh @@ -2,13 +2,14 @@ set -e ## ## A bash script installs the required dependencies of VALL-E-X and prepares the environment -export PATH=$PATH:/opt/conda/bin if [ "$BUILD_TYPE" != "cublas" ]; then echo "[mamba] Attention!!! nvcc is required - skipping installation" exit 0 fi +export PATH=$PATH:/opt/conda/bin + # Activate conda environment source activate transformers diff --git a/backend/python/petals/Makefile b/backend/python/petals/Makefile index 4bd07b112827..aa7778e15b29 100644 --- a/backend/python/petals/Makefile +++ b/backend/python/petals/Makefile @@ -1,7 +1,7 @@ .PHONY: petals petals: @echo "Creating virtual environment..." - @conda env create --name petals --file petals.yml + bash install.sh "petals.yml" @echo "Virtual environment created." 
.PHONY: run diff --git a/backend/python/petals/install.sh b/backend/python/petals/install.sh new file mode 100644 index 000000000000..97bcbb8af209 --- /dev/null +++ b/backend/python/petals/install.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +export PATH=$PATH:/opt/conda/bin + +conda env create --name petals --file $1 \ No newline at end of file diff --git a/backend/python/transformers/run.sh b/backend/python/transformers/run.sh index e6a42b7e1b6d..d09c1f5c0b47 100755 --- a/backend/python/transformers/run.sh +++ b/backend/python/transformers/run.sh @@ -3,10 +3,16 @@ ## ## A bash script wrapper that runs the transformers server with conda -export PATH=$PATH:/opt/conda/bin -# Activate conda environment -source activate transformers +if [ -d "/opt/intel" ]; then + # Assumes we are using the Intel oneAPI container image + # https://github.com/intel/intel-extension-for-pytorch/issues/538 + export XPU=1 +else + export PATH=$PATH:/opt/conda/bin + # Activate conda environment + source activate transformers +fi # get the directory where the bash script is located DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" diff --git a/backend/python/transformers/transformers_server.py b/backend/python/transformers/transformers_server.py index fe0b815a2226..41112c44f6e5 100755 --- a/backend/python/transformers/transformers_server.py +++ b/backend/python/transformers/transformers_server.py @@ -16,7 +16,15 @@ import grpc import torch import torch.cuda -from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed + +XPU=os.environ.get("XPU", "0") == "1" +if XPU: + import intel_extension_for_pytorch as ipex + from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM + from transformers import AutoTokenizer, AutoModel, set_seed +else: + from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed + _ONE_DAY_IN_SECONDS = 60 * 60 * 24 @@ -69,12 +77,25 @@ def LoadModel(self, request, context): model_name = 
request.Model try: if request.Type == "AutoModelForCausalLM": - self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode) + if XPU: + self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, + device_map="xpu", load_in_4bit=True) + else: + self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode) else: self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode) self.tokenizer = AutoTokenizer.from_pretrained(model_name) self.CUDA = False + self.XPU = False + + if XPU: + self.XPU = True + try: + print("Optimizing model", model_name, "to XPU.", file=sys.stderr) + self.model = ipex.optimize_transformers(self.model, inplace=True, dtype=torch.float16, device="xpu") + except Exception as err: + print("Not using XPU:", err, file=sys.stderr) if request.CUDA or torch.cuda.is_available(): try: @@ -139,6 +160,8 @@ def Predict(self, request, context): inputs = self.tokenizer(request.Prompt, return_tensors="pt").input_ids if self.CUDA: inputs = inputs.to("cuda") + if XPU: + inputs = inputs.to("xpu") outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP) diff --git a/backend/python/vall-e-x/Makefile b/backend/python/vall-e-x/Makefile index 4804f12ff521..8f34f559ee3a 100644 --- a/backend/python/vall-e-x/Makefile +++ b/backend/python/vall-e-x/Makefile @@ -1,3 +1,7 @@ +ifneq (,$(findstring sycl,$(BUILD_TYPE))) +export SKIP_CONDA=1 +endif + .PHONY: ttsvalle ttsvalle: $(MAKE) -C ../common-env/transformers diff --git a/backend/python/vall-e-x/install.sh b/backend/python/vall-e-x/install.sh index 26ccdccd0789..a9c4117e5cc0 100644 --- a/backend/python/vall-e-x/install.sh +++ b/backend/python/vall-e-x/install.sh @@ -2,13 +2,16 @@ ## ## A bash script installs the required dependencies of VALL-E-X and prepares the environment -export 
PATH=$PATH:/opt/conda/bin export SHA=3faaf8ccadb154d63b38070caf518ce9309ea0f4 -# Activate conda environment -source activate transformers +SKIP_CONDA=${SKIP_CONDA:-0} -echo $CONDA_PREFIX +if [ $SKIP_CONDA -ne 1 ]; then + export PATH=$PATH:/opt/conda/bin + source activate transformers +else + CONDA_PREFIX=$PWD +fi git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && git checkout -b build $SHA && popd diff --git a/core/backend/image.go b/core/backend/image.go index 60db48f96ba1..79b8d4ba15c4 100644 --- a/core/backend/image.go +++ b/core/backend/image.go @@ -8,27 +8,18 @@ import ( ) func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) { - + threads := backendConfig.Threads + if threads == 0 && appConfig.Threads != 0 { + threads = appConfig.Threads + } + gRPCOpts := gRPCModelOpts(backendConfig) opts := modelOpts(backendConfig, appConfig, []model.Option{ model.WithBackendString(backendConfig.Backend), model.WithAssetDir(appConfig.AssetsDestination), - model.WithThreads(uint32(backendConfig.Threads)), + model.WithThreads(uint32(threads)), model.WithContext(appConfig.Context), model.WithModel(backendConfig.Model), - model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{ - CUDA: backendConfig.CUDA || backendConfig.Diffusers.CUDA, - SchedulerType: backendConfig.Diffusers.SchedulerType, - PipelineType: backendConfig.Diffusers.PipelineType, - CFGScale: backendConfig.Diffusers.CFGScale, - LoraAdapter: backendConfig.LoraAdapter, - LoraScale: backendConfig.LoraScale, - LoraBase: backendConfig.LoraBase, - IMG2IMG: backendConfig.Diffusers.IMG2IMG, - CLIPModel: backendConfig.Diffusers.ClipModel, - CLIPSubfolder: backendConfig.Diffusers.ClipSubFolder, - CLIPSkip: int32(backendConfig.Diffusers.ClipSkip), - ControlNet: backendConfig.Diffusers.ControlNet, - 
}), + model.WithLoadGRPCLoadModelOpts(gRPCOpts), }) inferenceModel, err := loader.BackendLoader( diff --git a/core/backend/llm.go b/core/backend/llm.go index f16878c0f588..54e261889b8c 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -28,7 +28,10 @@ type TokenUsage struct { func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) { modelFile := c.Model - + threads := c.Threads + if threads == 0 && o.Threads != 0 { + threads = o.Threads + } grpcOpts := gRPCModelOpts(c) var inferenceModel grpc.Backend @@ -36,7 +39,7 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode opts := modelOpts(c, o, []model.Option{ model.WithLoadGRPCLoadModelOpts(grpcOpts), - model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup + model.WithThreads(uint32(threads)), // some models uses this to allocate threads during startup model.WithAssetDir(o.AssetsDestination), model.WithModel(modelFile), model.WithContext(o.Context), diff --git a/core/backend/options.go b/core/backend/options.go index d2bbb2b88e6d..3af6f6797a67 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -40,11 +40,23 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions { } return &pb.ModelOptions{ + CUDA: c.CUDA || c.Diffusers.CUDA, + SchedulerType: c.Diffusers.SchedulerType, + PipelineType: c.Diffusers.PipelineType, + CFGScale: c.Diffusers.CFGScale, + LoraAdapter: c.LoraAdapter, + LoraScale: c.LoraScale, + F16Memory: c.F16, + LoraBase: c.LoraBase, + IMG2IMG: c.Diffusers.IMG2IMG, + CLIPModel: c.Diffusers.ClipModel, + CLIPSubfolder: c.Diffusers.ClipSubFolder, + CLIPSkip: int32(c.Diffusers.ClipSkip), + ControlNet: c.Diffusers.ControlNet, ContextSize: int32(c.ContextSize), Seed: int32(c.Seed), NBatch: int32(b), NoMulMatQ: c.NoMulMatQ, - CUDA: 
c.CUDA, // diffusers, transformers DraftModel: c.DraftModel, AudioPath: c.VallE.AudioPath, Quantization: c.Quantization, @@ -58,12 +70,8 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions { YarnAttnFactor: c.YarnAttnFactor, YarnBetaFast: c.YarnBetaFast, YarnBetaSlow: c.YarnBetaSlow, - LoraAdapter: c.LoraAdapter, - LoraBase: c.LoraBase, - LoraScale: c.LoraScale, NGQA: c.NGQA, RMSNormEps: c.RMSNormEps, - F16Memory: c.F16, MLock: c.MMlock, RopeFreqBase: c.RopeFreqBase, RopeScaling: c.RopeScaling, diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index fce44fe15469..1e2af8f9d287 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -69,6 +69,13 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string return fmt.Sprintf("127.0.0.1:%d", port), nil } + // If no specific model path is set for transformers/HF, set it to the model path + for _, env := range []string{"HF_HOME", "TRANSFORMERS_CACHE", "HUGGINGFACE_HUB_CACHE"} { + if os.Getenv(env) == "" { + os.Setenv(env, ml.ModelPath) + } + } + // Check if the backend is provided as external if uri, ok := o.externalBackends[backend]; ok { log.Debug().Msgf("Loading external backend: %s", uri)