Pr139 dev #8

Merged
merged 32 commits into from
Aug 28, 2024
Changes from all commits
32 commits
226a0ac
removing unittest, update inference return type, fixing converting te…
risingsunomi Aug 25, 2024
e11bebd
adding nvidia quadro and t1000 support
risingsunomi Aug 25, 2024
778cb6e
updating test, updating model selection for smaller quant llama3 model
risingsunomi Aug 25, 2024
56aae50
added updating model options to update_deps.py
risingsunomi Aug 25, 2024
7df4640
updating inference class init to take shard, updating pytorch test_in…
risingsunomi Aug 25, 2024
aa769ca
adding updates for inference_engine.py
risingsunomi Aug 25, 2024
08e8b41
reducing layer amount for llama3-2b-base
risingsunomi Aug 25, 2024
dd2812b
fixing gpu tensor to numpy conversion issues, updating top_p_sampling…
risingsunomi Aug 25, 2024
7bcd35e
forward rewrite, adding in caching with dynamic cache, cache conversi…
risingsunomi Aug 26, 2024
3beea22
updates to caching, stuck on issue with infer_prompt and infer_tensor…
risingsunomi Aug 26, 2024
87a14ca
trying to fix infer problems
risingsunomi Aug 26, 2024
356bf2f
switched everything to use caching, did more prep for encoding the to…
risingsunomi Aug 26, 2024
aa89032
fixing test
risingsunomi Aug 26, 2024
b9331d7
adding init py for old python versions
risingsunomi Aug 26, 2024
2c7aa9c
update readme and add in init pys
risingsunomi Aug 26, 2024
6da3e94
adding more tests
risingsunomi Aug 26, 2024
d0bc93c
adding more try catch to move through tests
risingsunomi Aug 26, 2024
0e221b2
tests
risingsunomi Aug 26, 2024
9fc9fdb
added position embeddings, update test
risingsunomi Aug 26, 2024
2635b4c
tests
risingsunomi Aug 26, 2024
86e89eb
adding back tests
risingsunomi Aug 27, 2024
64fbacd
adding another test
risingsunomi Aug 27, 2024
fb7c73f
Merge pull request #6 from exo-explore/main
risingsunomi Aug 27, 2024
0d93130
added gc collect to remove gpu, fixed tokenizers warning
risingsunomi Aug 27, 2024
0ae716d
fixing device
risingsunomi Aug 27, 2024
7705639
adding smaller model test
risingsunomi Aug 27, 2024
81d597d
testing
risingsunomi Aug 28, 2024
f1d3e31
added tinyllama
risingsunomi Aug 28, 2024
bf0e606
changing top_p
risingsunomi Aug 28, 2024
432efb5
updating test
risingsunomi Aug 28, 2024
2cdc14c
adding A10, adding test
risingsunomi Aug 28, 2024
ed5bea7
removing reloading of shard, changing temp and top_p
risingsunomi Aug 28, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -170,3 +170,6 @@ cython_debug/
#.idea/

**/*.xcodeproj/*

# PyTorch interface
.offload
33 changes: 30 additions & 3 deletions exo/api/chatgpt_api.py
@@ -113,8 +113,27 @@ def remap_messages(messages: List[Message]) -> List[Message]:


def build_prompt(tokenizer, _messages: List[Message]):
if len(_messages) == 1:
user_msg = _messages[0]

# get instruct sys message
sys_msg = Message(role="system", content="You are a helpful assistant.")

# restructure for sys_msg to go first
_messages = [sys_msg, user_msg]

messages = remap_messages(_messages)
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)

if DEBUG >= 3:
print(f"prompt: {str(prompt)}")
for msg in messages:
print(f"chat role: {msg.role}\ncontent: {msg.content}")

image_str = None
for message in messages:
if not isinstance(message.content, list):
@@ -168,8 +187,13 @@ def __init__(self, node: Node, inference_engine_classname: str, response_timeout
allow_headers="*",
allow_methods="*",
)
cors.add(self.app.router.add_post("/v1/chat/completions", self.handle_post_chat_completions), {"*": cors_options})
cors.add(self.app.router.add_get("/models", self.handle_get_models), {"*": cors_options})
cors.add(self.app.router.add_get("/v1/models", self.handle_get_models), {"*": cors_options})
cors.add(self.app.router.add_post("/chat/token/encode", self.handle_post_chat_token_encode), {"*": cors_options})
cors.add(self.app.router.add_post("/v1/chat/token/encode", self.handle_post_chat_token_encode), {"*": cors_options})
cors.add(self.app.router.add_post("/chat/completions", self.handle_post_chat_completions), {"*": cors_options})
cors.add(self.app.router.add_post("/v1/chat/completions", self.handle_post_chat_completions), {"*": cors_options})

self.static_dir = Path(__file__).parent.parent.parent/"tinychat/examples/tinychat"
self.app.router.add_get("/", self.handle_root)
self.app.router.add_static("/", self.static_dir, name="static")
@@ -187,6 +211,9 @@ async def middleware(request):
async def handle_root(self, request):
return web.FileResponse(self.static_dir/"index.html")

async def handle_get_models(self, request):
return web.json_response([{"id": model_name, "object": "model", "owned_by": "exo", "ready": True } for model_name, _ in model_base_shards.items()])

async def handle_post_chat_token_encode(self, request):
data = await request.json()
shard = model_base_shards.get(data.get("model", "llama-3.1-8b"), {}).get(self.inference_engine_classname)
@@ -253,7 +280,7 @@ async def handle_post_chat_completions(self, request):
status=200,
reason="OK",
headers={
"Content-Type": "application/json",
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache",
},
)
Empty file added exo/download/__init__.py
Empty file.
13 changes: 3 additions & 10 deletions exo/download/download_progress.py
@@ -23,9 +23,7 @@ def to_dict(self):

@classmethod
def from_dict(cls, data):
# Convert eta from seconds back to timedelta
if 'eta' in data:
data['eta'] = timedelta(seconds=data['eta'])
if 'eta' in data: data['eta'] = timedelta(seconds=data['eta'])
return cls(**data)


@@ -53,13 +51,8 @@ def to_dict(self):

@classmethod
def from_dict(cls, data):
# Convert overall_eta from seconds back to timedelta
if 'overall_eta' in data:
data['overall_eta'] = timedelta(seconds=data['overall_eta'])

# Parse file_progress
if 'file_progress' in data:
data['file_progress'] = {k: RepoFileProgressEvent.from_dict(v) for k, v in data['file_progress'].items()}
if 'overall_eta' in data: data['overall_eta'] = timedelta(seconds=data['overall_eta'])
if 'file_progress' in data: data['file_progress'] = {k: RepoFileProgressEvent.from_dict(v) for k, v in data['file_progress'].items()}

return cls(**data)

Empty file added exo/download/hf/__init__.py
Empty file.
5 changes: 4 additions & 1 deletion exo/inference/inference_engine.py
@@ -8,7 +8,7 @@

class InferenceEngine(ABC):
@abstractmethod
async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
pass

@abstractmethod
@@ -27,5 +27,8 @@ def get_inference_engine(inference_engine_name: str, shard_downloader: 'ShardDow
tinygrad.helpers.DEBUG.value = int(os.getenv("TINYGRAD_DEBUG", default="0"))

return TinygradDynamicShardInferenceEngine(shard_downloader)
elif inference_engine_name == "pytorch":
from exo.inference.pytorch.inference import PyTorchDynamicShardInferenceEngine
return PyTorchDynamicShardInferenceEngine(shard_downloader)
else:
raise ValueError(f"Inference engine {inference_engine_name} not supported")
26 changes: 26 additions & 0 deletions exo/inference/pytorch/README.md
@@ -0,0 +1,26 @@
# PyTorch & HuggingFace inference engine
Experimental, still under development


## Install
Install the needed Python modules. Make sure you are using CUDA 12.4 for the PyTorch install:

```console
$ pip install torch --index-url https://download.pytorch.org/whl/cu124
$ pip install transformers accelerate
```

After installing accelerate you will hit the dependency error below. For now, ignore it until it can be fixed, as exo works fine with numpy 1.26.4:

```console
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
exo 0.0.1 requires numpy==2.0.0, but you have numpy 1.26.4 which is incompatible.
```
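
A quick sanity check (not part of the setup) to confirm which numpy version actually ended up installed; given the conflict above it should print 1.26.4:

```console
$ python -c "import numpy; print(numpy.__version__)"
```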

## Low VRAM Notes

- When trying to use disk_offload, the error "Cannot copy out of meta tensor; no data!" comes up; looking it up, it is tied to [low VRAM](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13087#issuecomment-2080272004). A rough sketch of the kind of offload being attempted is shown below.
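
For reference, a minimal sketch (with a placeholder model id, not the engine's actual loading path) of offloading weights to disk with accelerate; the `.offload` directory matches the new `.gitignore` entry:

```python
# Sketch only: offload HF model weights to disk with accelerate.
import torch
from transformers import AutoModelForCausalLM
from accelerate import disk_offload

model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # placeholder model id
    torch_dtype=torch.float16,
)

# Weights are paged in from ".offload" during forward passes; on low-VRAM
# machines this is where "Cannot copy out of meta tensor; no data!" shows up.
model = disk_offload(model, offload_dir=".offload")
```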

## Multiple GPU in 1 Notes
### Running multiple GPUs on 1 machine
- Running across two GPUs currently raises the error "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument tensors in method wrapper_CUDA_cat)". A possible direction is sketched below.
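
One possible direction (untested here, and separate from exo's own sharded loader) is to let accelerate split the layers across the visible GPUs and keep the inputs on the model's first device; the model id below is a placeholder:

```python
# Sketch only: split a HF model across multiple GPUs with device_map="auto".
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",  # accelerate assigns layers to cuda:0, cuda:1, ...
)

# Keeping the inputs on the same device as the first model shard avoids the
# "Expected all tensors to be on the same device" error when tensors are concatenated.
inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```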
Empty file.
137 changes: 96 additions & 41 deletions exo/inference/pytorch/inference.py
@@ -1,30 +1,30 @@
# experimental, based off of tinygrad/inference.py

import numpy as np
import torch
import numpy as np
import json
from typing import Optional, Callable, Tuple
from typing import Optional, Tuple
from exo.inference.shard import Shard
from exo.inference.inference_engine import InferenceEngine
from exo.inference.pytorch.model.hf import ShardedHuggingFaceModel
from exo.api.chatgpt_api import resolve_tokenizer
from exo.helpers import DEBUG
from transformers import DynamicCache
from accelerate import disk_offload

class PyTorchDynamicShardInferenceEngine(InferenceEngine):
"""
PyTorch Dynamic Shard Inference Engine for performing model inference with sharded models.
"""

def __init__(self):
def __init__(self, shard):
"""
Initialize the inference engine.

Args:
debug (bool): If True, enables debug logging. Defaults to False.
"""
self.shard = None
self.shard = shard
self.model = None
self.tokenizer = None
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -37,41 +37,57 @@ async def infer_prompt(
image_str: Optional[str] = None,
inference_state: Optional[str] = None
) -> Tuple[np.ndarray, str, bool]:
if DEBUG >= 2:
print("infer_prompt called")


await self.ensure_shard(shard)

# need to make this so inference_state is not a string
# cant use it with dynamic cache

tokens = self.tokenizer.encode(prompt, return_tensors="pt")
tokens = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
tokens = self.model.embed_tokens(tokens)
current_kvs = None

if DEBUG >= 2:
if DEBUG >= 4:
print("infer_prompt called")
print(f"tokens: {tokens}\n")

output_data = self.model.forward_layers(
tokens
print(f"layer_count: {self.shard.get_layer_count()}")
print(f"is_first_layer: {self.shard.is_first_layer()}")
print(f"is_last_layer: {self.shard.is_last_layer()}")

# convert inference_state or cache from json to DynamicCache
past_kv = DynamicCache()
if inference_state != None:
cache_dict = json.loads(inference_state)
past_kv.key_cache = [torch.tensor(data).to(self.device) for data in cache_dict['key_cache']]
past_kv.value_cache = [torch.tensor(data).to(self.device) for data in cache_dict['value_cache']]

output_data, current_kvs = self.model.forward(
tokens,
past_kv
)

is_finished = output_data.size == 1 and output_data.item() in [self.tokenizer.eos_token_id]

if is_finished:
print(f"token from llm decode: {self.tokenizer.decode(output_data)}")


if DEBUG >= 2:
if DEBUG >= 4:
print(f"output_data: {output_data}\n")
print(f"output_data.size {output_data.size}\n")
print(f"output_data.item() {output_data.item()}")

print(f"finished: {is_finished}")
print(f"self.tokenizer.eos_token_id {self.tokenizer.eos_token_id}")
print(f"output_data[-1] {output_data[-1]}")
print(f"output_data.item() in [self.tokenizer.eos_token_id]: {output_data.item() in [self.tokenizer.eos_token_id]}")

if output_data.size == 1:
print(f"size 1 output_data.item() {output_data.item()}")
print(f"output_data.item() in [self.tokenizer.eos_token_id]: {output_data.item() in [self.tokenizer.eos_token_id]}")

cache_dict = {
'key_cache': [tensor.tolist() for tensor in current_kvs.key_cache],
'value_cache': [tensor.tolist() for tensor in current_kvs.value_cache]
}

return (
output_data,
"",
json.dumps(cache_dict),
is_finished
)

@@ -80,39 +96,78 @@ async def infer_tensor(
request_id: str,
shard: Shard,
input_data: np.ndarray,
inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
inference_state: Optional[str] = None
) -> Tuple[np.ndarray, str, bool]:

in_tensor = torch.tensor(input_data)

# Ensure input_data is 2D: [batch_size, seq_len]
if in_tensor.dim() == 1:
in_tensor = in_tensor.unsqueeze(0) # Add a batch dimension: [1, seq_len]
await self.ensure_shard(shard)

if DEBUG >= 2:
print("infer_tensor called")
print(f"input_data: {input_data}\n")
print(f"in_tensor: {in_tensor}\n")
current_kvs = None

await self.ensure_shard(shard)
if input_data.size == 1:
in_tensor = torch.from_numpy(
input_data,
).unsqueeze(0).long().to(self.device)
else:
in_tensor = torch.from_numpy(
input_data
).long().to(self.device)

in_tensor = self.model.embed_tokens(in_tensor)

output_data = self.model.forward_layers(
in_tensor
if DEBUG >= 4:
print("infer_tensor called")
print(f"input_data: {input_data}")
print(f"input_data.size: {input_data.size}")
print(f"input_tensor: {in_tensor}\n")
print(f"shard: {self.shard}")
print(f"layer_count: {self.shard.get_layer_count()}")
print(f"is_first_layer: {self.shard.is_first_layer()}")
print(f"is_last_layer: {self.shard.is_last_layer()}")

# convert inference_state or cache from json to DynamicCache
past_kv = DynamicCache()
if inference_state != None:
try:
cache_dict = json.loads(inference_state)
past_kv.key_cache = [torch.tensor(data).to(self.device) for data in cache_dict['key_cache']]
past_kv.value_cache = [torch.tensor(data).to(self.device) for data in cache_dict['value_cache']]

if DEBUG >= 4:
print("Loaded past_kv from JSON")
print(f"past_kv: {past_kv}")
print(f"past_kv.key_cache len: {len(past_kv.key_cache)}")
print(f"past_kv.value_cache len: {len(past_kv.value_cache)}")
except json.JSONDecodeError:
print(f"ERROR DECODING INFERENCE STATE")

output_data, current_kvs = self.model.forward(
in_tensor,
past_kv
)

is_finished = output_data.size == 1 and output_data.item() in [self.tokenizer.eos_token_id]

if DEBUG >= 2:
if DEBUG >= 4:
print(f"in_tensor: {in_tensor}\n")
print(f"output_data: {output_data}\n")
print(f"output_data.size {output_data.size}\n")
print(f"output_data.item() {output_data.item()}")
print(f"finished: {is_finished}")
print(f"self.tokenizer.eos_token_id {self.tokenizer.eos_token_id}")
print(f"output_data[-1] {output_data[-1]}")
print(f"output_data.item() in [self.tokenizer.eos_token_id]: {output_data.item() in [self.tokenizer.eos_token_id]}")

if output_data.size == 1:
print(f"size 1 output_data.item() {output_data.item()}")
print(f"output_data.item() in [self.tokenizer.eos_token_id]: {output_data.item() in [self.tokenizer.eos_token_id]}")


cache_dict = {
'key_cache': [tensor.tolist() for tensor in current_kvs.key_cache],
'value_cache': [tensor.tolist() for tensor in current_kvs.value_cache]
}

return (
output_data,
"",
json.dumps(cache_dict),
is_finished
)

@@ -126,12 +181,12 @@ async def ensure_shard(self, shard: Optional[Shard]):
if self.shard == shard:
return

if DEBUG >= 2:
if DEBUG >= 4:
print(f"Loading new shard: {shard}")

self.model = ShardedHuggingFaceModel(shard)
self.tokenizer = await resolve_tokenizer(shard.model_id)
self.shard = shard
self.tokenizer = await resolve_tokenizer(shard.model_id)
self.model = ShardedHuggingFaceModel(shard, self.tokenizer)

if DEBUG >= 2:
if DEBUG >= 4:
print(f"Shard loaded successfully: {shard}")
Empty file.