
Commit 5411936: 1.7.0 release updates (#454)

* remove deprecated models
* add new OpenAI models
* raise the passthrough token limit
* bump the litellm version
* bump the refact version to 1.7.0
* upgrade Cython

1 parent: c5162ff

File tree: 6 files changed, +37 -137 lines

Dockerfile (+1)

```diff
@@ -44,6 +44,7 @@ RUN cd /tmp/refact-lsp \
 COPY . /tmp/app
 RUN echo "refact $(git -C /tmp/app rev-parse HEAD)" >> /refact-build-info.txt
 RUN pip install ninja
+RUN pip install -U cython
 RUN pip install /tmp/app -v --no-build-isolation && rm -rf /tmp/app

 ENV REFACT_PERM_DIR "/perm_storage"
```
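A note on why this line lands where it does (my inference, not stated in the commit): the install that follows uses `--no-build-isolation`, which makes pip build the package with whatever build dependencies are already in the image rather than in a fresh isolated environment, so `ninja` and an up-to-date `cython` have to be installed beforehand for the 1.7.0 build to succeed.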

README.md (-13)

```diff
@@ -106,22 +106,9 @@ Extensions > Refact.ai Assistant > Settings > Infurl
 | Model | Completion | Chat | Fine-tuning | [Deprecated](## "Will be removed in next versions") |
 |-------|------------|------|-------------|-----------------------------------------------------|
 | [Refact/1.6B](https://huggingface.co/smallcloudai/Refact-1_6B-fim) | + | | + | |
-| [starcoder/1b/base](https://huggingface.co/smallcloudai/starcoderbase-1b) | + | | + | + |
-| [starcoder/3b/base](https://huggingface.co/smallcloudai/starcoderbase-3b) | + | | + | + |
-| [starcoder/7b/base](https://huggingface.co/smallcloudai/starcoderbase-7b) | + | | + | + |
-| [starcoder/15b/base](https://huggingface.co/TheBloke/starcoder-GPTQ) | + | | | + |
-| [starcoder/15b/plus](https://huggingface.co/TheBloke/starcoderplus-GPTQ) | + | | | + |
 | [starcoder2/3b/base](https://huggingface.co/bigcode/starcoder2-3b) | + | | + | |
 | [starcoder2/7b/base](https://huggingface.co/bigcode/starcoder2-7b) | + | | + | |
 | [starcoder2/15b/base](https://huggingface.co/bigcode/starcoder2-15b) | + | | + | |
-| [wizardcoder/15b](https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GPTQ) | + | | | + |
-| [codellama/7b](https://huggingface.co/TheBloke/CodeLlama-7B-fp16) | + | | + | + |
-| [starchat/15b/beta](https://huggingface.co/TheBloke/starchat-beta-GPTQ) | | + | | + |
-| [wizardlm/7b](https://huggingface.co/TheBloke/WizardLM-7B-V1.0-Uncensored-GPTQ) | | + | | + |
-| [wizardlm/13b](https://huggingface.co/TheBloke/WizardLM-13B-V1.1-GPTQ) | | + | | + |
-| [wizardlm/30b](https://huggingface.co/TheBloke/WizardLM-30B-fp16) | | + | | + |
-| [llama2/7b](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ) | | + | | + |
-| [llama2/13b](https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ) | | + | | + |
 | [deepseek-coder/1.3b/base](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base) | + | | + | |
 | [deepseek-coder/5.7b/mqa-base](https://huggingface.co/deepseek-ai/deepseek-coder-5.7bmqa-base) | + | | + | |
 | [magicoder/6.7b](https://huggingface.co/TheBloke/Magicoder-S-DS-6.7B-GPTQ) | | + | | |
```

refact_known_models/huggingface.py (-118)

```diff
@@ -1,122 +1,4 @@
 huggingface_mini_db = {
-    "starcoder/15b/base": {
-        "backend": "autogptq",
-        "model_path": "TheBloke/starcoder-GPTQ",
-        "model_class_kwargs": {},
-        "required_memory_mb": 18000,
-        "T": 4096,
-        "filter_caps": ["completion"],
-        "deprecated": True,
-    },
-    "starcoder/15b/plus": {
-        "backend": "autogptq",
-        "model_path": "TheBloke/starcoderplus-GPTQ",
-        "model_class_kwargs": {},
-        "required_memory_mb": 18000,
-        "T": 4096,
-        "filter_caps": ["completion"],
-        "deprecated": True,
-    },
-    "starchat/15b/beta": {
-        "backend": "autogptq",
-        "model_path": "TheBloke/starchat-beta-GPTQ",
-        "model_class_kwargs": {},
-        "required_memory_mb": 18000,
-        "T": 4096,
-        "filter_caps": ["chat"],
-        "deprecated": True,
-    },
-    "starcoder/1b/base": {
-        "backend": "transformers",
-        "model_path": "smallcloudai/starcoderbase-1b",
-        "model_class_kwargs": {},
-        "required_memory_mb": 8000,
-        "T": 8192,
-        "filter_caps": ["completion", "finetune"],
-        "deprecated": True,
-    },
-    "starcoder/3b/base": {
-        "backend": "transformers",
-        "model_path": "smallcloudai/starcoderbase-3b",
-        "model_class_kwargs": {},
-        "required_memory_mb": 12000,
-        "T": 4096,
-        "filter_caps": ["completion", "finetune"],
-        "deprecated": True,
-    },
-    "starcoder/7b/base": {
-        "backend": "transformers",
-        "model_path": "smallcloudai/starcoderbase-7b",
-        "model_class_kwargs": {},
-        "required_memory_mb": 20000,
-        "T": 4096,
-        "filter_caps": ["completion", "finetune"],
-        "deprecated": True,
-    },
-    "wizardcoder/15b": {
-        "backend": "autogptq",
-        "model_path": "TheBloke/WizardCoder-15B-1.0-GPTQ",
-        "model_class_kwargs": {},
-        "required_memory_mb": 18000,
-        "T": 4096,
-        "filter_caps": ["completion"],
-        "deprecated": True,
-    },
-    "wizardlm/7b": {
-        "backend": "autogptq",
-        "model_path": "TheBloke/WizardLM-7B-V1.0-Uncensored-GPTQ",
-        "model_class_kwargs": {},
-        "required_memory_mb": 8000,
-        "T": 2048,
-        "filter_caps": ["chat"],
-        "deprecated": True,
-    },
-    "wizardlm/13b": {
-        "backend": "autogptq",
-        "model_path": "TheBloke/WizardLM-13B-V1.1-GPTQ",
-        "model_class_kwargs": {},
-        "required_memory_mb": 14000,
-        "T": 2048,
-        "filter_caps": ["chat"],
-        "deprecated": True,
-    },
-    "llama2/7b": {
-        "backend": "autogptq",
-        "model_path": "TheBloke/Llama-2-7b-Chat-GPTQ",
-        "model_class_kwargs": {},
-        "required_memory_mb": 8000,
-        "T": 2048,
-        "filter_caps": ["chat"],
-        "deprecated": True,
-    },
-    "llama2/13b": {
-        "backend": "autogptq",
-        "model_path": "TheBloke/Llama-2-13B-chat-GPTQ",
-        "model_class_kwargs": {},
-        "required_memory_mb": 14000,
-        "T": 2048,
-        "filter_caps": ["chat"],
-        "deprecated": True,
-    },
-    "codellama/7b": {
-        "backend": "transformers",
-        "model_path": "TheBloke/CodeLlama-7B-fp16",
-        "model_class_kwargs": {},
-        "required_memory_mb": 14000,
-        "T": 2048,
-        "filter_caps": ["completion"],
-        "deprecated": True,
-    },
-    "wizardlm/30b": {
-        "backend": "transformers",
-        "model_path": "TheBloke/WizardLM-30B-fp16",
-        "model_class_kwargs": {
-            "load_in_4bit": True,
-        },
-        "T": 2048,
-        "filter_caps": ["chat"],
-        "deprecated": True,
-    },
     "deepseek-coder/1.3b/base": {
         "backend": "transformers",
         "model_path": "deepseek-ai/deepseek-coder-1.3b-base",
```

refact_known_models/passthrough.py (+33)

```diff
@@ -89,4 +89,37 @@
         "pp1000t_generated": 15_000,
         "filter_caps": ["chat", "tools"],
     },
+    "gpt-4o-2024-05-13": {
+        "backend": "litellm",
+        "provider": "openai",
+        "tokenizer_path": "Xenova/gpt-4o",
+        "resolve_as": "gpt-4o-2024-05-13",
+        "T": 128_000,
+        "T_out": 4096,
+        "pp1000t_prompt": 5_000,
+        "pp1000t_generated": 15_000,  # $15.00 / 1M tokens
+        "filter_caps": ["chat", "tools"],
+    },
+    "gpt-4o-2024-08-06": {
+        "backend": "litellm",
+        "provider": "openai",
+        "tokenizer_path": "Xenova/gpt-4o",
+        "resolve_as": "gpt-4o-2024-08-06",
+        "T": 128_000,
+        "T_out": 4096,
+        "pp1000t_prompt": 2_500,
+        "pp1000t_generated": 10_000,  # $10.00 / 1M tokens
+        "filter_caps": ["chat", "tools"],
+    },
+    "gpt-4o-mini": {
+        "backend": "litellm",
+        "provider": "openai",
+        "tokenizer_path": "Xenova/gpt-4o",
+        "resolve_as": "gpt-4o-mini-2024-07-18",
+        "T": 128_000,
+        "T_out": 4096,
+        "pp1000t_prompt": 150,
+        "pp1000t_generated": 600,  # $0.60 / 1M tokens
+        "filter_caps": ["chat", "tools"],
+    },
 }
```
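The inline comments pin down the pricing unit: a `pp1000t_generated` of 15_000 corresponds to $15.00 per 1M tokens, so the values appear to be micro-dollars per 1,000 tokens. A minimal sketch under that assumption (the helper and its name are mine, not the repository's API):

```python
# Hypothetical sketch: convert pp1000t_* values to a dollar cost, assuming
# the unit is micro-dollars per 1,000 tokens, which matches the inline
# comments (e.g. 15_000 -> $15.00 per 1M generated tokens).


def usage_cost_usd(rec: dict, prompt_tokens: int, generated_tokens: int) -> float:
    """Price a request against a passthrough model record."""
    prompt_usd = prompt_tokens / 1000 * rec["pp1000t_prompt"] / 1_000_000
    generated_usd = generated_tokens / 1000 * rec["pp1000t_generated"] / 1_000_000
    return prompt_usd + generated_usd


# Example: 10k prompt + 1k generated tokens on gpt-4o-2024-08-06
# -> 10 * 2_500/1e6 + 1 * 10_000/1e6 = $0.025 + $0.010 = $0.035
```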

refact_webgui/webgui/selfhost_model_resolve.py (+1, -1)

```diff
@@ -39,7 +39,7 @@ def resolve_model_context_size(model_name: str, model_assigner: ModelAssigner) -
     if model_name in model_assigner.models_db:
         return model_assigner.model_assignment["model_assign"][model_name]["n_ctx"]

-    PASSTHROUGH_MAX_TOKENS_LIMIT = 16_000
+    PASSTHROUGH_MAX_TOKENS_LIMIT = 64_000

     if model_name in model_assigner.passthrough_mini_db:
         if max_tokens := model_assigner.passthrough_mini_db[model_name].get('T'):
```
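Raising this limit matters because the new gpt-4o entries advertise `T = 128_000`: the function reads the passthrough model's `T` and, presumably, clamps it to `PASSTHROUGH_MAX_TOKENS_LIMIT`. A sketch of that plausible continuation (the clamping line is my assumption; only the lines in the diff above come from the repository):

```python
# Hypothetical continuation of resolve_model_context_size, assuming the
# function clamps a passthrough model's advertised context ("T") to the limit.
PASSTHROUGH_MAX_TOKENS_LIMIT = 64_000


def resolve_context_size_sketch(model_rec: dict) -> int:
    max_tokens = model_rec.get("T", PASSTHROUGH_MAX_TOKENS_LIMIT)
    # A 128k-context model is served with a 64k window; before this commit
    # the same record would have been capped at 16k.
    return min(max_tokens, PASSTHROUGH_MAX_TOKENS_LIMIT)


# resolve_context_size_sketch({"T": 128_000}) -> 64_000
```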

setup.py (+2, -5)

```diff
@@ -35,10 +35,7 @@ class PyPackage:
     "refact_webgui": PyPackage(
         requires=["aiohttp", "aiofiles", "cryptography", "fastapi==0.100.0", "giturlparse", "pydantic>=2",
                   "starlette==0.27.0", "uvicorn", "uvloop", "termcolor", "python-multipart", "more_itertools",
-                  "scyllapy==1.3.0", "pandas>=2.0.3",
-                  # NOTE: litellm has bug with anthropic streaming, so we're staying on this version for now
-                  "litellm==1.42.0",
-                  ],
+                  "scyllapy==1.3.0", "pandas>=2.0.3", "litellm>=1.44.24"],
         requires_packages=["refact_known_models", "refact_utils"],
         data=["webgui/static/*", "webgui/static/components/modals/*",
               "webgui/static/dashboards/*", "webgui/static/assets/*", "webgui/static/utils/*",]),
@@ -94,7 +91,7 @@ def get_install_requires(packages):

 setup(
     name="refact-self-hosting",
-    version="1.6.4",
+    version="1.7.0",
     py_modules=list(setup_packages.keys()),
     package_data={
         name: py_package.data
```
