 # experimental, based off of tinygrad/inference.py
-
 import numpy as np
 import torch
 import numpy as np
 import json
-from typing import Optional, Callable, Tuple
+from typing import Optional, Tuple
 from exo.inference.shard import Shard
 from exo.inference.inference_engine import InferenceEngine
 from exo.inference.pytorch.model.hf import ShardedHuggingFaceModel
 from exo.api.chatgpt_api import resolve_tokenizer
 from exo.helpers import DEBUG
 from transformers import DynamicCache
+from accelerate import disk_offload

 class PyTorchDynamicShardInferenceEngine(InferenceEngine):
     """
     PyTorch Dynamic Shard Inference Engine for performing model inference with sharded models.
     """

-    def __init__(self):
+    def __init__(self, shard):
         """
         Initialize the inference engine.

         Args:
             debug (bool): If True, enables debug logging. Defaults to False.
         """
-        self.shard = None
+        self.shard = shard
         self.model = None
         self.tokenizer = None
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -37,41 +37,57 @@ async def infer_prompt(
         image_str: Optional[str] = None,
         inference_state: Optional[str] = None
     ) -> Tuple[np.ndarray, str, bool]:
-        if DEBUG >= 2:
-            print("infer_prompt called")
-
+
         await self.ensure_shard(shard)

         # need to make this so inference_state is not a string
         # cant use it with dynamic cache

-        tokens = self.tokenizer.encode(prompt, return_tensors="pt")
+        tokens = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
+        tokens = self.model.embed_tokens(tokens)
+        current_kvs = None

-        if DEBUG >= 2:
+        if DEBUG >= 4:
+            print("infer_prompt called")
             print(f"tokens: {tokens}\n")
-
-        output_data = self.model.forward_layers(
-            tokens
+            print(f"layer_count: {self.shard.get_layer_count()}")
+            print(f"is_first_layer: {self.shard.is_first_layer()}")
+            print(f"is_last_layer: {self.shard.is_last_layer()}")
+
+        # convert inference_state or cache from json to DynamicCache
+        past_kv = DynamicCache()
+        if inference_state != None:
+            cache_dict = json.loads(inference_state)
+            past_kv.key_cache = [torch.tensor(data).to(self.device) for data in cache_dict['key_cache']]
+            past_kv.value_cache = [torch.tensor(data).to(self.device) for data in cache_dict['value_cache']]
+
+        output_data, current_kvs = self.model.forward(
+            tokens,
+            past_kv
         )

         is_finished = output_data.size == 1 and output_data.item() in [self.tokenizer.eos_token_id]

-        if is_finished:
-            print(f"token from llm decode: {self.tokenizer.decode(output_data)}")
-
-
-        if DEBUG >= 2:
+        if DEBUG >= 4:
             print(f"output_data: {output_data}\n")
             print(f"output_data.size {output_data.size}\n")
-            print(f"output_data.item() {output_data.item()}")
+
             print(f"finished: {is_finished}")
             print(f"self.tokenizer.eos_token_id {self.tokenizer.eos_token_id}")
             print(f"output_data[-1] {output_data[-1]}")
-            print(f"output_data.item() in [self.tokenizer.eos_token_id]: {output_data.item() in [self.tokenizer.eos_token_id]}")
+
+            if output_data.size == 1:
+                print(f"size 1 output_data.item() {output_data.item()}")
+                print(f"output_data.item() in [self.tokenizer.eos_token_id]: {output_data.item() in [self.tokenizer.eos_token_id]}")
+
+        cache_dict = {
+            'key_cache': [tensor.tolist() for tensor in current_kvs.key_cache],
+            'value_cache': [tensor.tolist() for tensor in current_kvs.value_cache]
+        }

         return (
             output_data,
-            "",
+            json.dumps(cache_dict),
             is_finished
         )

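Aside (not part of the diff): the inference_state handling added above is just a JSON round-trip of the DynamicCache contents, so the KV cache can travel between nodes as a plain string. A minimal standalone sketch of that round-trip; the helper names are illustrative, not from this patch:

import json

import torch
from transformers import DynamicCache

def cache_to_json(cache: DynamicCache) -> str:
    # Serialize each per-layer key/value tensor to nested Python lists.
    return json.dumps({
        'key_cache': [k.tolist() for k in cache.key_cache],
        'value_cache': [v.tolist() for v in cache.value_cache]
    })

def cache_from_json(state: str, device: torch.device) -> DynamicCache:
    # Rebuild the cache on the target device, mirroring the
    # `if inference_state != None` branch in infer_prompt above.
    cache_dict = json.loads(state)
    cache = DynamicCache()
    cache.key_cache = [torch.tensor(k).to(device) for k in cache_dict['key_cache']]
    cache.value_cache = [torch.tensor(v).to(device) for v in cache_dict['value_cache']]
    return cache

This ships full float tensors as JSON lists on every step, which is simple but heavy; it works because DynamicCache exposes its per-layer tensors as the key_cache and value_cache lists.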
@@ -80,39 +96,78 @@ async def infer_tensor(
         request_id: str,
         shard: Shard,
         input_data: np.ndarray,
-        inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
+        inference_state: Optional[str] = None
+    ) -> Tuple[np.ndarray, str, bool]:

-        in_tensor = torch.tensor(input_data)
-
-        # Ensure input_data is 2D: [batch_size, seq_len]
-        if in_tensor.dim() == 1:
-            in_tensor = in_tensor.unsqueeze(0)  # Add a batch dimension: [1, seq_len]
+        await self.ensure_shard(shard)

-        if DEBUG >= 2:
-            print("infer_tensor called")
-            print(f"input_data: {input_data}\n")
-            print(f"in_tensor: {in_tensor}\n")
+        current_kvs = None

-        await self.ensure_shard(shard)
+        if input_data.size == 1:
+            in_tensor = torch.from_numpy(
+                input_data,
+            ).unsqueeze(0).long().to(self.device)
+        else:
+            in_tensor = torch.from_numpy(
+                input_data
+            ).long().to(self.device)
+
+        in_tensor = self.model.embed_tokens(in_tensor)

-        output_data = self.model.forward_layers(
-            in_tensor
+        if DEBUG >= 4:
+            print("infer_tensor called")
+            print(f"input_data: {input_data}")
+            print(f"input_data.size: {input_data.size}")
+            print(f"input_tensor: {in_tensor}\n")
+            print(f"shard: {self.shard}")
+            print(f"layer_count: {self.shard.get_layer_count()}")
+            print(f"is_first_layer: {self.shard.is_first_layer()}")
+            print(f"is_last_layer: {self.shard.is_last_layer()}")
+
+        # convert inference_state or cache from json to DynamicCache
+        past_kv = DynamicCache()
+        if inference_state != None:
+            try:
+                cache_dict = json.loads(inference_state)
+                past_kv.key_cache = [torch.tensor(data).to(self.device) for data in cache_dict['key_cache']]
+                past_kv.value_cache = [torch.tensor(data).to(self.device) for data in cache_dict['value_cache']]
+
+                if DEBUG >= 4:
+                    print("Loaded past_kv from JSON")
+                    print(f"past_kv: {past_kv}")
+                    print(f"past_kv.key_cache len: {len(past_kv.key_cache)}")
+                    print(f"past_kv.value_cache len: {len(past_kv.value_cache)}")
+            except json.JSONDecodeError:
+                print(f"ERROR DECODING INFERENCE STATE")
+
+        output_data, current_kvs = self.model.forward(
+            in_tensor,
+            past_kv
         )

         is_finished = output_data.size == 1 and output_data.item() in [self.tokenizer.eos_token_id]

-        if DEBUG >= 2:
+        if DEBUG >= 4:
+            print(f"in_tensor: {in_tensor}\n")
             print(f"output_data: {output_data}\n")
             print(f"output_data.size {output_data.size}\n")
-            print(f"output_data.item() {output_data.item()}")
             print(f"finished: {is_finished}")
             print(f"self.tokenizer.eos_token_id {self.tokenizer.eos_token_id}")
             print(f"output_data[-1] {output_data[-1]}")
-            print(f"output_data.item() in [self.tokenizer.eos_token_id]: {output_data.item() in [self.tokenizer.eos_token_id]}")
+
+            if output_data.size == 1:
+                print(f"size 1 output_data.item() {output_data.item()}")
+                print(f"output_data.item() in [self.tokenizer.eos_token_id]: {output_data.item() in [self.tokenizer.eos_token_id]}")
+
+
+        cache_dict = {
+            'key_cache': [tensor.tolist() for tensor in current_kvs.key_cache],
+            'value_cache': [tensor.tolist() for tensor in current_kvs.value_cache]
+        }

         return (
             output_data,
-            "",
+            json.dumps(cache_dict),
             is_finished
         )

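Aside (not part of the diff): the input_data.size == 1 branch added above reshapes a single sampled token id into a batched [1, 1] LongTensor before it is embedded. A standalone sketch of just that shaping, with an illustrative token id:

import numpy as np
import torch

input_data = np.array([42])  # one token id handed over from the previous node (illustrative)
if input_data.size == 1:
    # [1] -> [1, 1]: add the batch dimension the embedding layer expects
    in_tensor = torch.from_numpy(input_data).unsqueeze(0).long()
else:
    in_tensor = torch.from_numpy(input_data).long()

print(in_tensor.shape)  # torch.Size([1, 1])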
@@ -126,12 +181,12 @@ async def ensure_shard(self, shard: Optional[Shard]):
         if self.shard == shard:
             return

-        if DEBUG >= 2:
+        if DEBUG >= 4:
             print(f"Loading new shard: {shard}")

-        self.model = ShardedHuggingFaceModel(shard)
-        self.tokenizer = await resolve_tokenizer(shard.model_id)
         self.shard = shard
+        self.tokenizer = await resolve_tokenizer(shard.model_id)
+        self.model = ShardedHuggingFaceModel(shard, self.tokenizer)

-        if DEBUG >= 2:
+        if DEBUG >= 4:
             print(f"Shard loaded successfully: {shard}")