Commit 5322067: Merge branch 'master' into feat-musicgen-duration
Signed-off-by: Ettore Di Giacinto <[email protected]>
mudler authored Aug 23, 2024
2 parents 62a9386 + ac5f6f2 · commit 5322067
Showing 28 changed files with 214 additions and 72 deletions.
3 changes: 2 additions & 1 deletion .devcontainer-scripts/utils.sh
@@ -35,8 +35,9 @@ config_remote() {
 #
 # Param 1: bash array, filenames relative to the customization directory that should be copied to ~/.ssh
 setup_ssh() {
+    mkdir -p ~/.ssh
     local files=("$@")
-    for file in "${files[@]}"; then
+    for file in "${files[@]}" ; do
         local cfile="/devcontainer-customization/${file}"
         local hfile="~/.ssh/${file}"
         if [ ! -f "${hfile}" ]; then
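For reference, the repaired helper as a complete, runnable sketch. The final copy step is an assumption, since the hunk ends before the loop body does, and $HOME stands in for the quoted "~" (which the shell would not expand):

# Sketch of the corrected setup_ssh; the cp fallback is assumed (the hunk is truncated).
setup_ssh() {
    mkdir -p ~/.ssh
    local files=("$@")
    for file in "${files[@]}" ; do
        local cfile="/devcontainer-customization/${file}"
        # $HOME is used here: a tilde inside double quotes is taken literally.
        local hfile="$HOME/.ssh/${file}"
        if [ ! -f "${hfile}" ]; then
            # Assumed behavior: copy the customization file into ~/.ssh if missing.
            cp "${cfile}" "${hfile}"
        fi
    done
}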
4 changes: 2 additions & 2 deletions Makefile
@@ -8,15 +8,15 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=fc54ef0d1c138133a01933296d50a36a1ab64735
+CPPLLAMA_VERSION?=3ba780e2a8f0ffe13f571b27f0bbf2ca5a199efc

 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=d65786ea540a5aef21f67cacfa6f134097727780
+WHISPER_CPP_VERSION?=9e3c5345cd46ea718209db53464e426c3fe7a25e

 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
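All of these pins use Make's conditional assignment (?=), so a different revision can be tested without editing the Makefile — a quick sketch, assuming the repository's standard build target:

# ?= only assigns when the variable is not already set, so an environment override wins:
CPPLLAMA_VERSION=3ba780e2a8f0ffe13f571b27f0bbf2ca5a199efc make build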
2 changes: 1 addition & 1 deletion backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
accelerate
auto-gptq==0.7.1
-grpcio==1.65.4
+grpcio==1.66.0
protobuf
certifi
transformers
2 changes: 1 addition & 1 deletion backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
bark==0.1.5
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
certifi
2 changes: 1 addition & 1 deletion backend/python/common/template/requirements.txt
@@ -1,2 +1,2 @@
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
2 changes: 1 addition & 1 deletion backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
TTS==0.22.0
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
certifi
2 changes: 1 addition & 1 deletion backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
setuptools
-grpcio==1.65.4
+grpcio==1.66.0
pillow
protobuf
certifi
2 changes: 1 addition & 1 deletion backend/python/exllama/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
certifi
setuptools
2 changes: 1 addition & 1 deletion backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.65.4
+grpcio==1.66.0
protobuf
certifi
wheel
2 changes: 1 addition & 1 deletion backend/python/mamba/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
certifi
2 changes: 1 addition & 1 deletion backend/python/openvoice/requirements-intel.txt
@@ -2,7 +2,7 @@
intel-extension-for-pytorch
torch
optimum[openvino]
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
librosa==0.9.1
faster-whisper==1.0.3
2 changes: 1 addition & 1 deletion backend/python/openvoice/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
librosa
faster-whisper
2 changes: 1 addition & 1 deletion backend/python/parler-tts/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
certifi
llvmlite==0.43.0
2 changes: 1 addition & 1 deletion backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.65.4
+grpcio==1.66.0
protobuf
certifi
2 changes: 1 addition & 1 deletion backend/python/sentencetransformers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
certifi
2 changes: 1 addition & 1 deletion backend/python/transformers-musicgen/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
scipy==1.14.0
certifi
2 changes: 1 addition & 1 deletion backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
certifi
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
2 changes: 1 addition & 1 deletion backend/python/vall-e-x/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
certifi
2 changes: 1 addition & 1 deletion backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.65.5
+grpcio==1.66.0
protobuf
certifi
setuptools
9 changes: 6 additions & 3 deletions core/http/endpoints/openai/chat.go
@@ -25,9 +25,8 @@ import (
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/chat/completions [post]
func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error {
textContentToReturn := ""
id := uuid.New().String()
created := int(time.Now().Unix())
var id, textContentToReturn string
var created int

process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
initialMessage := schema.OpenAIResponse{
@@ -159,6 +158,10 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
}

	return func(c *fiber.Ctx) error {
+		textContentToReturn = ""
+		id = uuid.New().String()
+		created = int(time.Now().Unix())
+
		modelFile, input, err := readRequest(c, cl, ml, startupOptions, true)
		if err != nil {
			return fmt.Errorf("failed reading parameters from request:%w", err)
		}
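This moves the response id, created timestamp, and textContentToReturn from the endpoint closure (where they were computed once, at route registration) into the per-request handler, so each chat completion now gets a fresh id and timestamp instead of sharing state across requests. A quick way to observe it — a sketch assuming a LocalAI instance on localhost:8080 with some chat-capable model installed (the model name is a placeholder):

# Two successive requests should now return distinct "id" values; before the
# fix, every response from this endpoint carried the same id and created time.
for i in 1 2; do
  curl -s http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "my-chat-model", "messages": [{"role": "user", "content": "hello"}]}' \
    | grep -o '"id":"[^"]*"'
done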
2 changes: 1 addition & 1 deletion docs/data/version.json
@@ -1,3 +1,3 @@
{
"version": "v2.19.4"
"version": "v2.20.1"
}
91 changes: 91 additions & 0 deletions gallery/hermes-vllm.yaml
@@ -0,0 +1,91 @@
---
name: "hermes-vllm"

config_file: |
  backend: vllm
  context_size: 8192
  stopwords:
  - "<|im_end|>"
  - "<dummy32000>"
  - "<|eot_id|>"
  - "<|end_of_text|>"
  function:
    disable_no_action: true
    grammar:
      # Uncomment the line below to enable grammar matching for JSON results if the model is breaking
      # the output. This will make the model more accurate and won't break the JSON output.
      # This however, will make parallel_calls not functional (it is a known bug)
      # mixed_mode: true
      disable: true
      parallel_calls: true
      expect_strings_after_json: true
    json_regex_match:
    - "(?s)<tool_call>(.*?)</tool_call>"
    - "(?s)<tool_call>(.*)"
    capture_llm_results:
    - (?s)<scratchpad>(.*?)</scratchpad>
    replace_llm_results:
    - key: (?s)<scratchpad>(.*?)</scratchpad>
      value: ""
  template:
    use_tokenizer_template: true
    chat: |
      {{.Input -}}
      <|im_start|>assistant
    chat_message: |
      <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
      {{- if .FunctionCall }}
      <tool_call>
      {{- else if eq .RoleName "tool" }}
      <tool_response>
      {{- end }}
      {{- if .Content}}
      {{.Content }}
      {{- end }}
      {{- if .FunctionCall}}
      {{toJson .FunctionCall}}
      {{- end }}
      {{- if .FunctionCall }}
      </tool_call>
      {{- else if eq .RoleName "tool" }}
      </tool_response>
      {{- end }}<|im_end|>
    completion: |
      {{.Input}}
    function: |
      <|im_start|>system
      You are a function calling AI model.
      Here are the available tools:
      <tools>
      {{range .Functions}}
      {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
      {{end}}
      </tools>
      You should call the tools provided to you sequentially
      Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
      <scratchpad>
      {step-by-step reasoning and plan in bullet points}
      </scratchpad>
      For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
      <tool_call>
      {"arguments": <args-dict>, "name": <function-name>}
      </tool_call><|im_end|>
      {{.Input -}}
      <|im_start|>assistant
  # Uncomment to specify a quantization method (optional)
  # quantization: "awq"
  # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
  # gpu_memory_utilization: 0.5
  # Uncomment to trust remote code from huggingface
  # trust_remote_code: true
  # Uncomment to enable eager execution
  # enforce_eager: true
  # Uncomment to specify the size of the CPU swap space per GPU (in GiB)
  # swap_space: 2
  # Uncomment to specify the maximum length of a sequence (including prompt and output)
  # max_model_len: 32768
  # Uncomment and specify the number of Tensor divisions.
  # Allows you to partition and run large models. Performance gains are limited.
  # https://github.com/vllm-project/vllm/issues/1435
  # tensor_parallel_size: 2
32 changes: 32 additions & 0 deletions gallery/index.yaml
@@ -4752,6 +4752,38 @@
    - filename: Hermes-3-Llama-3.1-70B.Q4_K_M.gguf
      sha256: 955c2f42caade4278f3c9dbffa32bb74572652b20e49e5340e782de3585bbe3f
      uri: huggingface://NousResearch/Hermes-3-Llama-3.1-70B-GGUF/Hermes-3-Llama-3.1-70B.Q4_K_M.gguf
+- &hermes-vllm
+  url: "github:mudler/LocalAI/gallery/hermes-vllm.yaml@master"
+  name: "hermes-3-llama-3.1-8b:vllm"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/vG6j5WxHX09yj32vgjJlI.jpeg
+  tags:
+  - llm
+  - vllm
+  - gpu
+  - function-calling
+  license: llama-3
+  urls:
+  - https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-8B
+  description: |
+    Hermes 3 is a generalist language model with many improvements over Hermes 2, including advanced agentic capabilities, much better roleplaying, reasoning, multi-turn conversation, long context coherence, and improvements across the board. It is designed to focus on aligning LLMs to the user, with powerful steering capabilities and control given to the end user. The model uses ChatML as the prompt format, opening up a much more structured system for engaging the LLM in multi-turn chat dialogue. It also supports function calling and structured output capabilities, generalist assistant capabilities, and improved code generation skills.
+  overrides:
+    parameters:
+      model: NousResearch/Hermes-3-Llama-3.1-8B
+- !!merge <<: *hermes-vllm
+  name: "hermes-3-llama-3.1-70b:vllm"
+  urls:
+  - https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-70B
+  overrides:
+    parameters:
+      model: NousResearch/Hermes-3-Llama-3.1-70B
+- !!merge <<: *hermes-vllm
+  name: "hermes-3-llama-3.1-405b:vllm"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/-kj_KflXsdpcZoTQsvx7W.jpeg
+  urls:
+  - https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-405B
+  overrides:
+    parameters:
+      model: NousResearch/Hermes-3-Llama-3.1-405B
- !!merge <<: *hermes-2-pro-mistral
  name: "biomistral-7b"
  description: |
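With these index entries in place, the 8B config can be pulled through LocalAI's model-gallery API — a sketch, assuming a running instance on localhost:8080 and that the entry lives in the default gallery (the "localai@" prefix is an assumption):

# Ask a running LocalAI instance to install the gallery model, then verify it is listed:
curl http://localhost:8080/models/apply \
  -H "Content-Type: application/json" \
  -d '{"id": "localai@hermes-3-llama-3.1-8b:vllm"}'
curl http://localhost:8080/v1/models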
29 changes: 29 additions & 0 deletions gallery/vllm.yaml
@@ -0,0 +1,29 @@
---
name: "vllm"

config_file: |
  backend: vllm
  function:
    disable_no_action: true
    grammar:
      disable: true
      parallel_calls: true
      expect_strings_after_json: true
  template:
    use_tokenizer_template: true
  # Uncomment to specify a quantization method (optional)
  # quantization: "awq"
  # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
  # gpu_memory_utilization: 0.5
  # Uncomment to trust remote code from huggingface
  # trust_remote_code: true
  # Uncomment to enable eager execution
  # enforce_eager: true
  # Uncomment to specify the size of the CPU swap space per GPU (in GiB)
  # swap_space: 2
  # Uncomment to specify the maximum length of a sequence (including prompt and output)
  # max_model_len: 32768
  # Uncomment and specify the number of Tensor divisions.
  # Allows you to partition and run large models. Performance gains are limited.
  # https://github.com/vllm-project/vllm/issues/1435
  # tensor_parallel_size: 2
13 changes: 7 additions & 6 deletions go.mod
@@ -33,8 +33,8 @@ require (
	github.com/libp2p/go-libp2p v0.36.2
	github.com/mholt/archiver/v3 v3.5.1
	github.com/microcosm-cc/bluemonday v1.0.26
	github.com/mudler/edgevpn v0.27.3
-	github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c
+	github.com/mudler/go-processmanager v0.0.0-20240820160718-8b802d3ecf82
	github.com/mudler/go-stable-diffusion v0.0.0-20240429204715-4a3cd6aeae6f
	github.com/onsi/ginkgo/v2 v2.20.0
	github.com/onsi/gomega v1.34.1
@@ -84,6 +84,7 @@ require (
	github.com/pion/transport/v2 v2.2.10 // indirect
	github.com/pion/turn/v2 v2.1.6 // indirect
	github.com/pion/webrtc/v3 v3.3.0 // indirect
+	github.com/shirou/gopsutil/v4 v4.24.7 // indirect
	github.com/wlynxg/anet v0.0.4 // indirect
	go.uber.org/mock v0.4.0 // indirect
)
@@ -132,7 +133,7 @@ require (
	github.com/go-audio/riff v1.0.0 // indirect
	github.com/go-logr/logr v1.4.2 // indirect
	github.com/go-logr/stdr v1.2.2 // indirect
-	github.com/go-ole/go-ole v1.2.6 // indirect
+	github.com/go-ole/go-ole v1.3.0 // indirect
	github.com/go-openapi/jsonpointer v0.21.0 // indirect
	github.com/go-openapi/jsonreference v0.21.0 // indirect
	github.com/go-openapi/spec v0.21.0 // indirect
@@ -188,7 +189,7 @@ require (
	github.com/libp2p/go-yamux/v4 v4.0.1 // indirect
	github.com/libp2p/zeroconf/v2 v2.2.0 // indirect
	github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
-	github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
+	github.com/lufia/plan9stats v0.0.0-20240819163618-b1d8f4d146e7 // indirect
	github.com/mailru/easyjson v0.7.7 // indirect
	github.com/marten-seemann/tcp v0.0.0-20210406111302-dfbc87cc63fd // indirect
	github.com/mattn/go-colorable v0.1.13 // indirect
@@ -234,7 +235,7 @@ require (
	github.com/pkoukk/tiktoken-go v0.1.6 // indirect
	github.com/pmezard/go-difflib v1.0.0 // indirect
	github.com/polydawn/refmt v0.89.0 // indirect
-	github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
+	github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
	github.com/prometheus/client_model v0.6.1 // indirect
	github.com/prometheus/common v0.55.0 // indirect
	github.com/prometheus/procfs v0.15.1 // indirect
@@ -252,8 +253,8 @@
	github.com/spf13/cast v1.5.0 // indirect
	github.com/swaggo/files/v2 v2.0.0 // indirect
	github.com/tinylib/msgp v1.1.8 // indirect
-	github.com/tklauser/go-sysconf v0.3.12 // indirect
-	github.com/tklauser/numcpus v0.6.1 // indirect
+	github.com/tklauser/go-sysconf v0.3.14 // indirect
+	github.com/tklauser/numcpus v0.8.0 // indirect
	github.com/ulikunitz/xz v0.5.9 // indirect
	github.com/valyala/bytebufferpool v1.0.0 // indirect
	github.com/valyala/tcplisten v1.0.0 // indirect