[Bounty] PyTorch & HuggingFace Interface #139

Open — wants to merge 510 commits into base: main
Changes from 250 commits (510 commits total)
1b27532
layer test
risingsunomi Aug 11, 2024
0876c79
layer test
risingsunomi Aug 11, 2024
e5c56c4
layer test
risingsunomi Aug 11, 2024
696c3bb
layer test
risingsunomi Aug 11, 2024
e807a63
layer test
risingsunomi Aug 11, 2024
0b589ee
layer test
risingsunomi Aug 11, 2024
ced3879
layer test
risingsunomi Aug 11, 2024
c6693a8
layer test
risingsunomi Aug 11, 2024
30e971d
layer test
risingsunomi Aug 11, 2024
8b4e624
fixing layer issue
risingsunomi Aug 11, 2024
bcb499e
temp and layer test
risingsunomi Aug 11, 2024
724c6c4
temp and layer test
risingsunomi Aug 11, 2024
e23f3f7
temp and layer test
risingsunomi Aug 11, 2024
7f13a6d
temp and layer test
risingsunomi Aug 11, 2024
ec92328
temp and layer test
risingsunomi Aug 11, 2024
fc3d224
temp and layer test
risingsunomi Aug 11, 2024
f14a339
temp and layer test
risingsunomi Aug 11, 2024
4f4a9e1
temp and layer test
risingsunomi Aug 11, 2024
3da44b3
change temp
risingsunomi Aug 11, 2024
0a4a003
change temp and alpha
risingsunomi Aug 11, 2024
e351501
change temp and alpha
risingsunomi Aug 11, 2024
3251567
change temp
risingsunomi Aug 11, 2024
16e4f7e
change temp
risingsunomi Aug 11, 2024
6083927
change temp
risingsunomi Aug 11, 2024
5b02fd1
change sampling
risingsunomi Aug 11, 2024
9805ac2
change sampling
risingsunomi Aug 11, 2024
8da3114
change sampling
risingsunomi Aug 11, 2024
c62dd2d
change sampling
risingsunomi Aug 11, 2024
0f7f96d
change sampling
risingsunomi Aug 11, 2024
fc36619
change sampling
risingsunomi Aug 11, 2024
b5f98d5
remove softmax
risingsunomi Aug 11, 2024
52d608f
remove softmax
risingsunomi Aug 11, 2024
b17a9ab
float long issue
risingsunomi Aug 11, 2024
69552e0
float long issue
risingsunomi Aug 11, 2024
1ee8a10
float long issue
risingsunomi Aug 11, 2024
1d9f482
float long issue
risingsunomi Aug 11, 2024
2ca9689
float long issue
risingsunomi Aug 11, 2024
0b8c9f2
cleaning up utils.py
risingsunomi Aug 11, 2024
94de83f
removing broken llama.py
risingsunomi Aug 11, 2024
f63f4b0
Merge pull request #2 from exo-explore/main
risingsunomi Aug 11, 2024
ab273d3
Merge branch 'main' into main
risingsunomi Aug 22, 2024
c365749
Merge pull request #4 from exo-explore/main
risingsunomi Aug 24, 2024
0b8221f
Merge pull request #5 from exo-explore/main
risingsunomi Aug 24, 2024
226a0ac
removing unittest, update inference return type, fixing converting te…
risingsunomi Aug 25, 2024
e11bebd
adding nvidia quadro and t1000 support
risingsunomi Aug 25, 2024
778cb6e
updating test, updating model selection for smaller quant llama3 model
risingsunomi Aug 25, 2024
56aae50
added updating model options to update_deps.py
risingsunomi Aug 25, 2024
7df4640
updating inference class init to take shard, updating pytorch test_in…
risingsunomi Aug 25, 2024
aa769ca
adding updates for inference_engine.py
risingsunomi Aug 25, 2024
08e8b41
reducing layer amount for llama3-2b-base
risingsunomi Aug 25, 2024
dd2812b
fixing gpu tensor to numpy conversion issues, updating top_p_sampling…
risingsunomi Aug 25, 2024
7bcd35e
forward rewrite, adding in caching with dynamic cache, cache conversi…
risingsunomi Aug 26, 2024
3beea22
updates to caching, stuck on issue with infer_prompt and infer_tensor…
risingsunomi Aug 26, 2024
87a14ca
trying to fix infer problems
risingsunomi Aug 26, 2024
356bf2f
switched everything to use caching, did more prep for encoding the to…
risingsunomi Aug 26, 2024
aa89032
fixing test
risingsunomi Aug 26, 2024
b9331d7
adding init py for old python versions
risingsunomi Aug 26, 2024
2c7aa9c
update readme and add in init pys
risingsunomi Aug 26, 2024
6da3e94
adding more tests
risingsunomi Aug 26, 2024
d0bc93c
adding more try catch to move through tests
risingsunomi Aug 26, 2024
0e221b2
tests
risingsunomi Aug 26, 2024
9fc9fdb
added position embeddings, update test
risingsunomi Aug 26, 2024
2635b4c
tests
risingsunomi Aug 26, 2024
86e89eb
adding back tests
risingsunomi Aug 27, 2024
64fbacd
adding another test
risingsunomi Aug 27, 2024
fb7c73f
Merge pull request #6 from exo-explore/main
risingsunomi Aug 27, 2024
0d93130
added gc collect to remove gpu, fixed tokenizers warning
risingsunomi Aug 27, 2024
0ae716d
fixing device
risingsunomi Aug 27, 2024
7705639
adding smaller model test
risingsunomi Aug 27, 2024
81d597d
testing
risingsunomi Aug 28, 2024
f1d3e31
added tinyllama
risingsunomi Aug 28, 2024
bf0e606
changing top_p
risingsunomi Aug 28, 2024
432efb5
updating test
risingsunomi Aug 28, 2024
2cdc14c
adding A10, adding test
risingsunomi Aug 28, 2024
ed5bea7
removing reloading of shard, changing temp and top_p
risingsunomi Aug 28, 2024
ea41845
Merge pull request #7 from exo-explore/main
risingsunomi Aug 28, 2024
46667b6
Merge pull request #8 from risingsunomi/pr139-dev
risingsunomi Aug 28, 2024
032c9b1
rewrite of sharded model using new split testing of huggingface models
risingsunomi Sep 1, 2024
626b223
building out new hf.py class, testing qwen and llama3 8b
risingsunomi Sep 1, 2024
f983e93
trying to load in weights but transformers/pytorch doesnt allow that …
risingsunomi Sep 4, 2024
d142be0
adding more testing, refining logit selection
risingsunomi Sep 13, 2024
be8d7fb
working split model test, updating class
risingsunomi Sep 15, 2024
9d1ecdd
working on class and inference engine updates
risingsunomi Sep 15, 2024
4b0df06
building out inference engine test
risingsunomi Sep 15, 2024
623468c
adding working tests, update to forward function to just use input_id…
risingsunomi Sep 16, 2024
19b322d
cleaning up code and tests, debugging and adding in cleaned up loggin…
risingsunomi Sep 16, 2024
cc2c14c
getting infer and stop token issues
risingsunomi Sep 16, 2024
583629c
add tracking of next token and other logits into the full input_ids s…
risingsunomi Sep 17, 2024
7ec5bb8
grpc testing
risingsunomi Sep 17, 2024
5903e63
grpc testing
risingsunomi Sep 17, 2024
e7a3fd0
grpc testing
risingsunomi Sep 17, 2024
f6eec5a
grpc testing
risingsunomi Sep 17, 2024
d441a51
grpc testing
risingsunomi Sep 17, 2024
e7f6dcb
grpc testing
risingsunomi Sep 17, 2024
ba5b005
grpc testing
risingsunomi Sep 17, 2024
6242d76
grpc testing
risingsunomi Sep 17, 2024
5630731
grpc testing
risingsunomi Sep 17, 2024
4a29268
testing passing hidden states in inference_state
risingsunomi Sep 17, 2024
2daf65f
testing passing hidden states in inference_state
risingsunomi Sep 17, 2024
36d5cde
fixing scalar issue, reversing passing hidden_states
risingsunomi Sep 17, 2024
6917f30
inference bug fix, grpc testing
risingsunomi Sep 17, 2024
adab336
inference bug fix, grpc testing
risingsunomi Sep 17, 2024
73146dd
fixing hf model for hidden_states
risingsunomi Sep 17, 2024
929386d
fixing hf model for hidden_states
risingsunomi Sep 17, 2024
32b8f67
fixing hf model for hidden_states
risingsunomi Sep 17, 2024
c86facb
fixing hf model for hidden_states
risingsunomi Sep 17, 2024
d15b20d
fixing hf model for hidden_states
risingsunomi Sep 17, 2024
5e41bc4
fixing hf model for hidden_states
risingsunomi Sep 17, 2024
b29c5f8
fixing hf model for hidden_states
risingsunomi Sep 17, 2024
ddaa79c
fixing kvcache issue
risingsunomi Sep 17, 2024
3164d38
fixing kvcache issue
risingsunomi Sep 17, 2024
e8532bc
fixing kvcache issue
risingsunomi Sep 17, 2024
6a5b8db
fixing kvcache issue
risingsunomi Sep 17, 2024
515687d
working on passing past input_ids between infers and nodes
risingsunomi Sep 18, 2024
92ebdd5
implemented infer caching and passing cache information via inference…
risingsunomi Sep 19, 2024
f0795bd
removing dynamic cache passing in inference_state as model does its o…
risingsunomi Sep 19, 2024
b8f15a0
removed clearning cache on infer prompt and only on finished infer te…
risingsunomi Sep 19, 2024
d0f3cb7
hidden state dropping between nodes issue
risingsunomi Sep 19, 2024
fa6f263
hidden state dropping between nodes issue
risingsunomi Sep 19, 2024
2b0e7b5
hidden state dropping between nodes issue
risingsunomi Sep 19, 2024
f793c00
Merge branch 'main' of github.com:exo-explore/exo into exo-fork-update
risingsunomi Sep 19, 2024
131c158
Merge branch 'exo-fork-update' of github.com:risingsunomi/exo-nvidia …
risingsunomi Sep 19, 2024
43a1f61
Merge pull request #11 from risingsunomi/exo-fork-update
risingsunomi Sep 19, 2024
8398409
Merge branch 'exo-explore:main' into main
risingsunomi Sep 26, 2024
09572c1
Merge github.com:exo-explore/exo into exo-explore-main
risingsunomi Oct 2, 2024
c861f30
Merge pull request #13 from risingsunomi/exo-explore-main
risingsunomi Oct 2, 2024
cee3e31
cleaning up code, removing helpers.py
risingsunomi Oct 2, 2024
f95942f
Merge branch 'main' of github.com:risingsunomi/exo-nvidia
risingsunomi Oct 2, 2024
c3ea732
Merge branch 'main' into HEAD
AlexCheema Oct 6, 2024
57e14e8
adding needed libs to setup.py, fixing 4 space to 2 space issue, addi…
risingsunomi Oct 6, 2024
9fe3ec6
cleaning up code, added pytorch engine to llama 3.2 1b model shard in…
risingsunomi Oct 6, 2024
447da4a
Merge pull request #14 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 6, 2024
b44f6e9
updating pytorch requirement
risingsunomi Oct 6, 2024
6e1ab58
Merge pull request #15 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 6, 2024
936e60a
trying tokenizer fixes for llama3.1
risingsunomi Oct 6, 2024
43c3c62
detecting 3.1 for adding padding token and using autotokenizer for ll…
risingsunomi Oct 6, 2024
75a29f4
updating models.py to use instruct version
risingsunomi Oct 7, 2024
e407404
fixing autotokenizer
risingsunomi Oct 7, 2024
47ff4b3
Merge pull request #16 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 7, 2024
668668f
making it so position and cache is computed every forward on hf model
risingsunomi Oct 7, 2024
4e356f8
loading cached input_ids when passing hidden states
risingsunomi Oct 7, 2024
a5ef04a
loading cached iids from infer state fix
risingsunomi Oct 7, 2024
e888baa
device fix
risingsunomi Oct 7, 2024
7d9eb17
position id fix
risingsunomi Oct 7, 2024
1198628
fixing inference instance state issues between nodes
risingsunomi Oct 7, 2024
d25b7ac
node testing
risingsunomi Oct 7, 2024
0721a4c
node inference fix
risingsunomi Oct 7, 2024
49b682b
node inference fix
risingsunomi Oct 7, 2024
77a52a5
node inference fix
risingsunomi Oct 7, 2024
2b3397f
node inference fix
risingsunomi Oct 7, 2024
2e588af
node inference fix
risingsunomi Oct 7, 2024
e2eba05
node inference fix
risingsunomi Oct 7, 2024
d7699eb
node inference fix
risingsunomi Oct 7, 2024
bd9bf4f
inference between nodes fixed by always calculating position id and i…
risingsunomi Oct 7, 2024
72bed37
Merge pull request #17 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 7, 2024
913a008
cleaning up code
risingsunomi Oct 7, 2024
a1c1c76
Merge pull request #18 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 7, 2024
d509e43
Merge branch 'main' into HEAD
AlexCheema Oct 7, 2024
216e83d
Merge branch 'main' into HEAD
AlexCheema Oct 7, 2024
b518f73
comma and other text issue fix
risingsunomi Oct 7, 2024
296dff6
Merge pull request #19 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 7, 2024
9d24779
adding threadpooling to forward and logit sampling
risingsunomi Oct 9, 2024
fe6ae45
Merge pull request #20 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 9, 2024
2c55634
Merge branch 'exo-explore:main' into main
risingsunomi Oct 10, 2024
d4fb74f
rename (PyTorch, pytorch) -> (Torch, torch)
AlexCheema Oct 10, 2024
edf1c3d
add ci jobs for chatgpt_api_integration_test_torch_linux_cpu and chat…
AlexCheema Oct 10, 2024
0fd6711
add ci jobs for chatgpt_api_integration_test_torch_linux_cpu and chat…
AlexCheema Oct 10, 2024
a4feeab
ci filters
AlexCheema Oct 10, 2024
55fd482
rm comments
AlexCheema Oct 10, 2024
da39519
ci
AlexCheema Oct 10, 2024
89f1be0
Merge branch 'main' into HEAD
AlexCheema Oct 10, 2024
b6f6afc
Merge remote-tracking branch 'origin/main' into HEAD
AlexCheema Oct 10, 2024
5eb6c34
fixed torch device selection
risingsunomi Oct 11, 2024
ed64437
Merge branch 'main' of github.com:risingsunomi/exo-nvidia into pr139-…
risingsunomi Oct 11, 2024
18d41eb
fixing imports
risingsunomi Oct 11, 2024
c73ed76
Merge pull request #21 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 11, 2024
9ecbf0c
fixing chatgpt_api mistake
risingsunomi Oct 11, 2024
79c9e70
Merge branch 'exo-explore:main' into main
risingsunomi Oct 11, 2024
ebfd44a
Merge pull request #22 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 11, 2024
dae2cbe
removing old pytorch folder
risingsunomi Oct 11, 2024
1c1dd06
Merge pull request #23 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 11, 2024
55ae027
Update README.md
risingsunomi Oct 11, 2024
4b6a86d
set all torch models in models.py
AlexCheema Oct 11, 2024
830d33d
in torch, explicitly set the device when initilaizing the model
AlexCheema Oct 11, 2024
074dfe3
spacing
AlexCheema Oct 11, 2024
d9cfcc4
add model mlx-community/Qwen2-0.5B-Instruct-4bit
AlexCheema Oct 11, 2024
c3e1934
Merge branch 'exo-explore:main' into main
risingsunomi Oct 12, 2024
2c056b4
code changes from PR feedback, working on splitting of weights
risingsunomi Oct 12, 2024
da5c28d
Merge branch 'exo-explore:main' into pr139-dev-oct24
risingsunomi Oct 12, 2024
83a723b
doing more work toward individual safetensor loading, adding back dev…
risingsunomi Oct 13, 2024
47be250
working on split model, moving to server for more vram
risingsunomi Oct 13, 2024
ea0d4b1
change to hf downloader as was not getting all safetensor files
risingsunomi Oct 13, 2024
30b7991
splitting model still work in progress as transformers still seems to…
risingsunomi Oct 13, 2024
3a2c431
updating readme
risingsunomi Oct 13, 2024
4def538
Merge branch 'main' into pr139-dev-oct24
risingsunomi Oct 13, 2024
b35224c
Merge pull request #24 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 13, 2024
6c6e7b2
successful splitting model test with only loading needed weights, imp…
risingsunomi Oct 14, 2024
55ffdc7
Merge branch 'pr139-dev-oct24' of github.com:risingsunomi/exo-nvidia …
risingsunomi Oct 14, 2024
aacdeb5
adding model sharding to inference engine, doing testing with inferen…
risingsunomi Oct 14, 2024
ce702d1
fixing layer range issue
risingsunomi Oct 14, 2024
e387a79
fixing layer range issue
risingsunomi Oct 14, 2024
e0ba2bb
fixing layer range issue
risingsunomi Oct 14, 2024
5b9638f
checking if ram over usaage even if reducing layers on large models
risingsunomi Oct 14, 2024
664f29f
half layer inference engine testing
risingsunomi Oct 14, 2024
2591fab
fixing layer amount with sharded modeling
risingsunomi Oct 14, 2024
99dac57
adding qwen2.5 3B for testing
risingsunomi Oct 14, 2024
c12526f
Merge pull request #25 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 14, 2024
493cd3e
updating inference engine test
risingsunomi Oct 14, 2024
de23294
cleaning up utils and split model
risingsunomi Oct 14, 2024
d5a02be
Merge pull request #26 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 14, 2024
e7470b1
bugfix in llm setup
dtnewman Oct 15, 2024
fa24f46
Merge pull request #27 from dtnewman/main
risingsunomi Oct 15, 2024
5c69f3f
Merge remote-tracking branch 'origin/main' into HEAD
AlexCheema Oct 16, 2024
f5a1cef
handle range not satisfiable edge case
AlexCheema Oct 16, 2024
751bd1c
updating to use automodelforcausallm instead of autoconfig
risingsunomi Oct 16, 2024
7d866d8
removing meta model
risingsunomi Oct 16, 2024
253237b
updating split model test
risingsunomi Oct 16, 2024
e46ffa4
updating split model test
risingsunomi Oct 16, 2024
476b6ba
automodel fix
risingsunomi Oct 16, 2024
f7e02e9
fixing split model test
risingsunomi Oct 16, 2024
bd6322f
pytorch offload buffers error
risingsunomi Oct 17, 2024
c51bd91
device_map any issue with split model
risingsunomi Oct 17, 2024
4a2aef4
updating split model test
risingsunomi Oct 17, 2024
79f0763
fixing split model issue
risingsunomi Oct 17, 2024
cbbc9cf
fixing node issues
risingsunomi Oct 17, 2024
58cebab
fixing node issues
risingsunomi Oct 17, 2024
7f9b1bb
fixing node issues
risingsunomi Oct 17, 2024
c3adec5
fixing node issues
risingsunomi Oct 17, 2024
c8e6acc
fixing node issues
risingsunomi Oct 17, 2024
df028e2
fixing node issues, range issue
risingsunomi Oct 17, 2024
e5a1939
fixing node issues, range issue
risingsunomi Oct 17, 2024
d03a85c
Merge branch 'main' into pr139-dev-oct24
risingsunomi Oct 17, 2024
69a8955
Merge pull request #28 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 17, 2024
d07b825
adding num hidden layers manipulation for all models
risingsunomi Oct 18, 2024
a840e7f
updating to use shard_num_hidden_layers
risingsunomi Oct 18, 2024
bf5f22d
Merge branch 'main' of github.com:risingsunomi/exo-nvidia into pr139-…
risingsunomi Oct 18, 2024
52fa3f8
adding in better layer manipulation
risingsunomi Oct 18, 2024
ec49e31
adding in safe tensor sharding, generate model.safetensors.index.json…
risingsunomi Oct 19, 2024
f45b514
implementing sharding tests, fixing bugs with safetensor recompile
risingsunomi Oct 19, 2024
f90c24a
adding safetensor sharding, implementing it into model inference engine
risingsunomi Oct 20, 2024
696c264
updating backup and backup restore
risingsunomi Oct 20, 2024
9514e92
added removing backup when restoring
risingsunomi Oct 20, 2024
d65505e
added generating weight map if none, did updates to backup and restor…
risingsunomi Oct 20, 2024
d5b6113
cleaning up logging
risingsunomi Oct 20, 2024
d2302cc
updating docstring in newest class file
risingsunomi Oct 20, 2024
35c32eb
Merge pull request #29 from risingsunomi/pr139-dev-oct24
risingsunomi Oct 20, 2024
291aa10
Merge branch 'exo-explore:main' into main
risingsunomi Oct 23, 2024
fcb298b
Merge branch 'main' into main
AlexCheema Oct 23, 2024
6e32be6
merge fixing
risingsunomi Oct 27, 2024
df13fbc
Merge branch 'main' into pr/risingsunomi/30
risingsunomi Oct 27, 2024
109 changes: 108 additions & 1 deletion .circleci/config.yml
@@ -178,6 +178,50 @@ jobs:
inference_engine: mlx
model_id: llama-3.2-1b

chatgpt_api_integration_test_torch_linux_cpu:
machine:
image: ubuntu-2404:2024.08.1
resource_class: large
steps:
- checkout
- run:
name: Set up Python
command: |
sudo apt-get update
sudo apt-get install -y python3.12 python3.12-venv
python3.12 -m venv env
source env/bin/activate
- run:
name: Install dependencies
command: |
source env/bin/activate
pip install --upgrade pip
pip install .
- run_chatgpt_api_test:
inference_engine: torch
model_id: llama-3.2-1b

chatgpt_api_integration_test_torch_mac:
macos:
xcode: "15.4.0"
resource_class: macos.m1.large.gen1
steps:
- checkout
- run:
name: Set up Python
command: |
brew install python@3.12
python3.12 -m venv env
source env/bin/activate
- run:
name: Install dependencies
command: |
source env/bin/activate
pip install --upgrade pip
pip install .
- run_chatgpt_api_test:
inference_engine: torch
model_id: llama-3.2-1b

test_macos_m1:
macos:
xcode: "16.0.0"
@@ -211,9 +255,72 @@ jobs:
workflows:
version: 2
build_and_test:
jobs:
- approve_run:
type: approval
requires: []
filters:
branches:
ignore: main
- unit_test:
requires:
- approve_run
- discovery_integration_test:
requires:
- approve_run
- chatgpt_api_integration_test_mlx:
requires:
- approve_run
- test_macos_m1:
requires:
- approve_run
- chatgpt_api_integration_test_torch_linux_cpu:
requires:
- approve_run
- chatgpt_api_integration_test_torch_mac:
requires:
- approve_run

# Workflow for forked PRs without approval
forked_pr_workflow:
jobs:
- unit_test
- discovery_integration_test
- chatgpt_api_integration_test_mlx
- test_macos_m1
# - chatgpt_api_integration_test_tinygrad
- chatgpt_api_integration_test_torch_linux_cpu
- chatgpt_api_integration_test_torch_mac
# The trigger condition ensures this workflow runs for forked PRs
triggers:
- type: pull_request
filters:
branches:
ignore: main

# Existing workflow for main branch
main_branch_workflow:
jobs:
- unit_test:
filters:
branches:
only: main
- discovery_integration_test:
filters:
branches:
only: main
- chatgpt_api_integration_test_mlx:
filters:
branches:
only: main
- test_macos_m1:
filters:
branches:
only: main
- chatgpt_api_integration_test_torch_linux_cpu:
filters:
branches:
only: main
- chatgpt_api_integration_test_torch_mac:
filters:
branches:
only: main
6 changes: 6 additions & 0 deletions .gitignore
@@ -170,3 +170,9 @@ cython_debug/
#.idea/

**/*.xcodeproj/*

# PyTorch interface
.offload

# neovim/vim settings
.vimrc
4 changes: 3 additions & 1 deletion exo/api/chatgpt_api.py
@@ -17,7 +17,6 @@
from exo.models import model_base_shards
from typing import Callable


class Message:
def __init__(self, role: str, content: Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]):
self.role = role
@@ -60,6 +59,9 @@ def generate_completion(
"finish_reason": finish_reason,
}],
}

if DEBUG >= 3:
print(f"completion: {completion}")

if not stream:
completion["usage"] = {
1 change: 0 additions & 1 deletion exo/helpers.py
@@ -33,7 +33,6 @@ def get_system_info():
return "Linux"
return "Non-Mac, non-Linux system"


def find_available_port(host: str = "", min_port: int = 49152, max_port: int = 65535) -> int:
used_ports_file = os.path.join(tempfile.gettempdir(), "exo_used_ports")

5 changes: 4 additions & 1 deletion exo/inference/inference_engine.py
@@ -8,7 +8,7 @@

class InferenceEngine(ABC):
@abstractmethod
async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
pass

@abstractmethod
@@ -27,5 +27,8 @@ def get_inference_engine(inference_engine_name: str, shard_downloader: 'ShardDow
tinygrad.helpers.DEBUG.value = int(os.getenv("TINYGRAD_DEBUG", default="0"))

return TinygradDynamicShardInferenceEngine(shard_downloader)
elif inference_engine_name == "torch":
from exo.inference.torch.inference import TorchDynamicShardInferenceEngine
return TorchDynamicShardInferenceEngine(shard_downloader)
else:
raise ValueError(f"Inference engine {inference_engine_name} not supported")
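
A minimal usage sketch of the new "torch" branch — not code from this PR; the `shard_downloader` and `shard` arguments are assumed to come from exo's existing setup:

```python
import asyncio

from exo.inference.inference_engine import get_inference_engine

async def run(shard_downloader, shard):
  # "torch" now resolves to TorchDynamicShardInferenceEngine (see the diff above)
  engine = get_inference_engine("torch", shard_downloader)
  # infer_prompt returns Tuple[np.ndarray, str, bool]:
  # (output array, serialized inference state, is_finished flag)
  output, state, finished = await engine.infer_prompt("req-1", shard, "What is 2 + 2?")
  print(output.shape, finished)

# asyncio.run(run(shard_downloader, shard))  # given a ShardDownloader and a Shard
```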
2 changes: 2 additions & 0 deletions exo/inference/torch/.gitignore
@@ -0,0 +1,2 @@
data/
model/archive/
9 changes: 9 additions & 0 deletions exo/inference/torch/README.md
@@ -0,0 +1,9 @@
# PyTorch & HuggingFace inference engine

## Notes/Issues
### 10/10/2024
- To select a PyTorch device via environment variables, set the variable TORCH_DEVICE (see the sketch after these notes)
- XLA is currently not installed and will need to be added to inference.py; looking into doing this on a TPU VM
- With PyTorch, ROCm builds use the CUDA device interface, so specifying CUDA also enables ROCm support. See this [post](https://github.com/pytorch/pytorch/issues/55223#issuecomment-812587373)
- Looking into adding mobile device support properly
- If the device is not CPU, the data type defaults to float32; otherwise it defaults to float16.
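
A minimal sketch of the device and dtype selection these notes describe; the helper name is illustrative, and the actual logic in inference.py may differ:

```python
import os
import torch

def resolve_device() -> torch.device:
  # TORCH_DEVICE overrides automatic selection, per the note above
  name = os.getenv("TORCH_DEVICE")
  if name:
    return torch.device(name)
  # On ROCm builds this same check also covers AMD GPUs
  if torch.cuda.is_available():
    return torch.device("cuda")
  return torch.device("cpu")

device = resolve_device()
# Per the note above: non-CPU devices default to float32, CPU to float16
dtype = torch.float32 if device.type != "cpu" else torch.float16
```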
Empty file added exo/inference/torch/__init__.py