From 74b98fdd99aa1fcfc427555db0bf09125b2720a5 Mon Sep 17 00:00:00 2001
From: Alex Cheema
Date: Tue, 19 Nov 2024 15:34:43 +0400
Subject: [PATCH 1/3] update package versions to work on python >= 3.9

---
 setup.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index b84d8f66a..fdcb4c1e5 100644
--- a/setup.py
+++ b/setup.py
@@ -8,8 +8,8 @@
   "aiohttp==3.10.2",
   "aiohttp_cors==0.7.0",
   "aiofiles==24.1.0",
-  "grpcio==1.64.1",
-  "grpcio-tools==1.64.1",
+  "grpcio==1.68.0",
+  "grpcio-tools==1.68.0",
   "Jinja2==3.1.4",
   "netifaces==0.11.0",
   "numpy==2.0.0",
@@ -21,10 +21,9 @@
   "pydantic==2.9.2",
   "requests==2.32.3",
   "rich==13.7.1",
-  "safetensors==0.4.3",
   "tenacity==9.0.0",
   "tqdm==4.66.4",
-  "transformers==4.43.3",
+  "transformers==4.46.3",
   "uuid==1.30",
   "tinygrad @ git+https://github.com/tinygrad/tinygrad.git@232edcfd4f8b388807c64fb1817a7668ce27cbad",
 ]

From 4ece73423ee48c9a9cf649d377c3fa7dddb0c43a Mon Sep 17 00:00:00 2001
From: Alex Cheema
Date: Tue, 19 Nov 2024 15:47:12 +0400
Subject: [PATCH 2/3] always run tinygrad stuff on same thread. tricky because
 of lazy evaluation

---
 exo/inference/tinygrad/inference.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/exo/inference/tinygrad/inference.py b/exo/inference/tinygrad/inference.py
index a7b331a05..c256922ea 100644
--- a/exo/inference/tinygrad/inference.py
+++ b/exo/inference/tinygrad/inference.py
@@ -7,7 +7,6 @@
 from tinygrad.nn.state import load_state_dict
 from tinygrad import Tensor, nn, Context
 from exo.inference.inference_engine import InferenceEngine
-from typing import Optional, Tuple
 import numpy as np
 from exo.inference.tinygrad.tinygrad_helpers import concat_weights, load
 from exo.download.shard_download import ShardDownloader
@@ -68,24 +67,21 @@ def __init__(self, shard_downloader: ShardDownloader):
   async def sample(self, x: np.ndarray, temp=TEMPERATURE, top_p: float = 0.0) -> np.ndarray:
     logits = x[:, -1, :]
     def sample_wrapper():
-      return sample_logits(Tensor(logits).flatten(), temp, 0, 0.8, top_p, 0.0).realize()
-    out = await asyncio.get_running_loop().run_in_executor(self.executor, sample_wrapper)
-    return out.numpy().astype(int)
+      return sample_logits(Tensor(logits).flatten(), temp, 0, 0.8, top_p, 0.0).realize().numpy().astype(int)
+    return await asyncio.get_running_loop().run_in_executor(self.executor, sample_wrapper)
 
   async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
     await self.ensure_shard(shard)
     tokens = await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.encode, prompt)
-    return np.array(tokens)
+    return await asyncio.get_running_loop().run_in_executor(self.executor, np.array, tokens)
 
   async def decode(self, shard: Shard, tokens) -> str:
     await self.ensure_shard(shard)
-    tokens = await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.decode, tokens)
-    return tokens
+    return await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.decode, tokens)
 
   async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray) -> np.ndarray:
     await self.ensure_shard(shard)
-    output_data = await asyncio.get_running_loop().run_in_executor(self.executor, lambda: self.model(Tensor(input_data), request_id).realize())
-    return output_data.numpy()
+    return await asyncio.get_running_loop().run_in_executor(self.executor, lambda: self.model(Tensor(input_data), request_id).realize().numpy())
 
   async def ensure_shard(self, shard: Shard):
     if self.shard == shard:
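
[Note on PATCH 2/3] The pattern above works around tinygrad's lazy evaluation:
.realize() executes the scheduled kernels, and .numpy() still has to copy the
result off the device, so returning a Tensor from the executor and calling
.numpy() on the event-loop thread would split the work across two threads.
The patch keeps the entire build/realize/materialize chain inside the
single-worker executor. A minimal standalone sketch of the pattern, assuming
a trivial doubling computation (the `executor` and `double` names here are
illustrative, not exo APIs):

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    import numpy as np
    from tinygrad import Tensor

    # One worker thread, so every tinygrad call lands on the same thread.
    executor = ThreadPoolExecutor(max_workers=1)

    async def double(data: np.ndarray) -> np.ndarray:
      def work():
        # Build, realize, and copy back all on the executor thread;
        # splitting .numpy() out to the caller would run the device-to-host
        # copy on the event-loop thread instead.
        return (Tensor(data) * 2).realize().numpy()
      return await asyncio.get_running_loop().run_in_executor(executor, work)
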
From 312602fa13f89a2199c79154996109679dbebebd Mon Sep 17 00:00:00 2001
From: Alex Cheema
Date: Tue, 19 Nov 2024 15:52:21 +0400
Subject: [PATCH 3/3] fix shard_specific_patterns

---
 exo/download/hf/hf_helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/exo/download/hf/hf_helpers.py b/exo/download/hf/hf_helpers.py
index 4729e5f61..37d4d04bc 100644
--- a/exo/download/hf/hf_helpers.py
+++ b/exo/download/hf/hf_helpers.py
@@ -409,7 +409,7 @@ def get_allow_patterns(weight_map: Dict[str, str], shard: Shard) -> List[str]:
   elif shard.is_last_layer():
     shard_specific_patterns.add(sorted_file_names[-1])
   else:
-    shard_specific_patterns = set("*.safetensors")
+    shard_specific_patterns = set(["*.safetensors"])
   if DEBUG >= 2: print(f"get_allow_patterns {weight_map=} {shard=} {shard_specific_patterns=}")
   return list(default_patterns | shard_specific_patterns)
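
[Note on PATCH 3/3] The bug being fixed: set() iterates its argument, so
set("*.safetensors") produces a set of the string's individual characters
rather than a one-element set containing the glob pattern. A quick
illustration in a Python REPL:

    >>> sorted(set("*.safetensors"))   # iterates the string: wrong
    ['*', '.', 'a', 'e', 'f', 'n', 'o', 'r', 's', 't']
    >>> set(["*.safetensors"])         # one glob pattern: right
    {'*.safetensors'}

The set literal {"*.safetensors"} would be an equivalent fix.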