@@ -1,4 +1,4 @@
-from chat_module import LLMChatModule, supported_models, quantization_keys
+from .chat_module import ChatModule, supported_models, quantization_keys
 
 from pydantic import BaseModel
 from fastapi import FastAPI, HTTPException
@@ -15,20 +15,14 @@
 
 session = {}
 
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
 
     ARGS = _parse_args()
 
-    chat_mod = LLMChatModule(
-        ARGS.mlc_lib_path,
-        ARGS.device_name,
-        ARGS.device_id
-    )
-    model_path = os.path.join(
-        ARGS.artifact_path,
-        ARGS.model + "-" + ARGS.quantization
-    )
+    chat_mod = ChatModule(ARGS.mlc_lib_path, ARGS.device_name, ARGS.device_id)
+    model_path = os.path.join(ARGS.artifact_path, ARGS.model + "-" + ARGS.quantization)
     model_dir = ARGS.model + "-" + ARGS.quantization
     model_lib = model_dir + "-" + ARGS.device_name + ".so"
     lib_dir = os.path.join(model_path, model_lib)
@@ -38,16 +32,22 @@ async def lifespan(app: FastAPI):
     elif os.path.exists(prebuilt_lib_dir):
         lib = tvm.runtime.load_module(prebuilt_lib_dir)
     else:
-        raise ValueError(f"Unable to find {model_lib} at {lib_dir} or {prebuilt_lib_dir}.")
+        raise ValueError(
+            f"Unable to find {model_lib} at {lib_dir} or {prebuilt_lib_dir}."
+        )
 
     local_model_path = os.path.join(model_path, "params")
-    prebuilt_model_path = os.path.join(ARGS.artifact_path, "prebuilt", f"mlc-chat-{model_dir}")
+    prebuilt_model_path = os.path.join(
+        ARGS.artifact_path, "prebuilt", f"mlc-chat-{model_dir}"
+    )
     if os.path.exists(local_model_path):
         chat_mod.reload(lib=lib, model_path=local_model_path)
     elif os.path.exists(prebuilt_model_path):
         chat_mod.reload(lib=lib, model_path=prebuilt_model_path)
     else:
-        raise ValueError(f"Unable to find model params at {local_model_path} or {prebuilt_model_path}.")
+        raise ValueError(
+            f"Unable to find model params at {local_model_path} or {prebuilt_model_path}."
+        )
     session["chat_mod"] = chat_mod
 
     yield
@@ -57,13 +57,11 @@ async def lifespan(app: FastAPI):
 
 app = FastAPI(lifespan=lifespan)
 
+
 def _parse_args():
     args = argparse.ArgumentParser()
     args.add_argument(
-        "--model",
-        type=str,
-        choices=supported_models(),
-        default="vicuna-v1-7b"
+        "--model", type=str, choices=supported_models(), default="vicuna-v1-7b"
     )
     args.add_argument("--artifact-path", type=str, default="dist")
     args.add_argument(
@@ -85,65 +83,74 @@ def _parse_args():
 """
 Lists the currently supported models and provides basic information about each of them.
 """
+
+
 @app.get("/models")
 async def read_models():
-    return {
-        "data": [{
-            "id": model,
-            "object":"model"
-        } for model in supported_models()]
-    }
+    return {"data": [{"id": model, "object": "model"} for model in supported_models()]}
+
 
 """
 Retrieve a model instance with basic information about the model.
 """
+
+
 @app.get("/models/{model}")
 async def read_model(model: str):
     if model not in supported_models():
         raise HTTPException(status_code=404, detail=f"Model {model} is not supported.")
-    return {
-        "id": model,
-        "object":"model"
-    }
+    return {"id": model, "object": "model"}
+
 
 class ChatRequest(BaseModel):
     prompt: str
     stream: bool = False
 
+
 """
 Creates a model response for the given chat conversation.
 """
+
+
 @app.post("/chat/completions")
 def request_completion(request: ChatRequest):
     session["chat_mod"].prefill(input=request.prompt)
     if request.stream:
+
         def iter_response():
             while not session["chat_mod"].stopped():
                 session["chat_mod"].decode()
                 msg = session["chat_mod"].get_message()
                 yield json.dumps({"message": msg})
-        return StreamingResponse(iter_response(), media_type='application/json')
+
+        return StreamingResponse(iter_response(), media_type="application/json")
     else:
         msg = None
         while not session["chat_mod"].stopped():
             session["chat_mod"].decode()
             msg = session["chat_mod"].get_message()
         return {"message": msg}
 
+
 """
 Reset the chat for the currently initialized model.
 """
+
+
 @app.post("/chat/reset")
 def reset():
     session["chat_mod"].reset_chat()
 
+
 """
 Get the runtime stats.
 """
+
+
 @app.get("/stats")
 def read_stats():
     return session["chat_mod"].runtime_stats_text()
 
 
 if __name__ == "__main__":
-    uvicorn.run("server:app", port=8000, reload=True, access_log=False)
+    uvicorn.run("mlc_chat.server:app", port=8000, reload=True, access_log=False)
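
For reference, a minimal client sketch exercising the endpoints added in this diff. It assumes the server has already been started locally and is listening on port 8000 (matching the uvicorn.run call above) and that the third-party requests package is installed; the base URL and the example prompts are assumptions, while the paths, payload fields, and response shapes follow the handler code shown here.

# Hypothetical client sketch; assumes the server runs locally on port 8000.
import requests  # third-party HTTP client (assumption: installed separately)

BASE = "http://127.0.0.1:8000"  # assumed base URL for a local uvicorn instance

# List the supported models, then fetch the default one.
print(requests.get(f"{BASE}/models").json())
print(requests.get(f"{BASE}/models/vicuna-v1-7b").json())

# Non-streaming completion: the server replies with {"message": ...} once decoding stops.
resp = requests.post(
    f"{BASE}/chat/completions",
    json={"prompt": "What is the capital of Canada?", "stream": False},
)
print(resp.json()["message"])

# Streaming completion: each chunk is a json.dumps({"message": ...}) snapshot of the reply.
with requests.post(
    f"{BASE}/chat/completions",
    json={"prompt": "Write a haiku about GPUs.", "stream": True},
    stream=True,
) as r:
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        if chunk:
            print(chunk)

# Runtime stats, then reset the conversation state.
print(requests.get(f"{BASE}/stats").json())
requests.post(f"{BASE}/chat/reset")

Because the server keeps a single ChatModule in the module-level session dict, all clients share one conversation state until /chat/reset is called.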