from typing import List
from os.path import exists as path_exists
from functools import partial

import torch

from transformers import TextStreamer

from Cluster.InfernBatchedWorker import InfernBatchedWorker
from Cluster.InfernTTSWorker import get_torch_hw
from Cluster.LLMSession import LLMResult, LLMInferRequest

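# ResultsStreamer fans a single batched generate() call out to the per-request
# callbacks: it accumulates the sampled token ids per batch element, re-decodes
# them incrementally and pushes a chunk to the request's textout_cb whenever one
# of the sync_on sentence boundaries shows up in the newly decoded text.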
class ResultsStreamer(TextStreamer):
    debug = False
    sync_on = ('. ', '? ', '! ', '\n')
    decode_batch_size = 8
    def __init__(self, wis:List[LLMInferRequest], upper:'InfernLLMWorker'):
        super().__init__(tokenizer=upper.llm_tokenizer)
        self.wi_cbs = tuple(wi.textout_cb for wi in wis)
        self.newLLMResult = tuple(partial(LLMResult, req_id=wi.req.id) for wi in wis)
        batch_size = len(wis)
        self.oposs = [0 for _ in range(batch_size)]
        self.current_tokens = None
        self.batch_decode = partial(upper.llm_tokenizer.batch_decode, skip_special_tokens=True)

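    # Called by generate() at every decoding step with the new token ids for the
    # whole batch; the very first call carries the prompt tokens and is only used
    # to size the accumulation buffer.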
    def put(self, token_ids):
        if self.current_tokens is None:
            self.current_tokens = torch.zeros((token_ids.shape[0], 0), dtype=torch.long)
            return
        if token_ids.dim() == 1:  # shape [batch_size] -> [batch_size, 1]
            token_ids = token_ids.unsqueeze(1)
        self.current_tokens = torch.cat([self.current_tokens, token_ids], dim=1)
        # Only re-decode every decode_batch_size tokens to keep the overhead down.
        if self.current_tokens.shape[1] % self.decode_batch_size != 0:
            return
        results = self.batch_decode(self.current_tokens)
        for (ir, r), op, cb, newLR in zip(enumerate(results), self.oposs, self.wi_cbs, self.newLLMResult):
            new_content = r[op:]
            if len(new_content) == 0: continue
            # Look for a sentence boundary in the not-yet-emitted tail.
            sp = (op + pos + len(c) for c in self.sync_on if (pos:=new_content.rfind(c)) >= 0)
            try:
                spos = next(sp)
            except StopIteration:
                continue
            r = r[op:spos-1]
            if len(r) < 10: continue
            cb(result=newLR(r))
            self.oposs[ir] = spos
        if self.debug:
            print(f'{self.oposs=} {self.current_tokens.shape=}')

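    # Called once generation has finished: flush whatever is left past the last
    # emitted sentence boundary for each request.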
    def end(self):
        if self.debug:
            print(f'finished: {self.current_tokens.shape=}')
        results = self.batch_decode(self.current_tokens)
        for r, op, cb, newLR in zip(results, self.oposs, self.wi_cbs, self.newLLMResult):
            if len(r) == op: continue
            cb(result=newLR(r[op:]))
        del self.current_tokens
        del self.wi_cbs

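# InfernLLMWorker serves batched LLMInferRequest's: it loads Qwen2.5-14B-Instruct
# through ipex_llm in 4-bit and streams partial results back via ResultsStreamer.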
class InfernLLMWorker(InfernBatchedWorker):
    model_name = "Qwen/Qwen2.5-14B-Instruct"
    model_cache_dir = f"/tmp/saved_model.{model_name}"
    max_batch_size: int = 8
    debug = True
    llm_model: object
    llm_tokenizer: object
    output_sr: int

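    # Loading strategy: reuse a previously saved low-bit checkpoint from
    # model_cache_dir when one exists; otherwise fetch model_name, quantize it to
    # 4 bit and cache the converted copy for subsequent start-ups.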
    def __init__(self, device=None):
        from warnings import filterwarnings
        filterwarnings("ignore", category=FutureWarning)
        filterwarnings("ignore", category=UserWarning)
        from transformers import AutoTokenizer
        from ipex_llm.transformers import AutoModelForCausalLM
        super().__init__()
        if device is None:
            device = get_torch_hw()
        def load_model(mn):
            m = AutoModelForCausalLM.from_pretrained(mn, torch_dtype="auto",
                                                     device_map="auto",
                                                     optimize_model=True,
                                                     trust_remote_code=True,
                                                     load_in_4bit=True,
                                                     use_cache=True)
            if mn != self.model_cache_dir:
                m.save_low_bit(self.model_cache_dir)
            return m.to(device)
        if path_exists(self.model_cache_dir):
            try:
                model = AutoModelForCausalLM.load_low_bit(self.model_cache_dir,
                                                          trust_remote_code=True)
            except Exception:
                model = load_model(self.model_name)
        else:
            model = load_model(self.model_name)
        self.llm_model = model.to(device)
        self.llm_tokenizer = AutoTokenizer.from_pretrained(self.model_name)

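    # Render each request's chat context into a prompt with the tokenizer's chat
    # template, tokenize the whole batch with padding, and stream the generation
    # through ResultsStreamer so callers receive text as it is produced.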
    def process_batch(self, wis:List[LLMInferRequest]):
        if self.debug:
            print(f'InfernLLMWorker.process_batch: got {len(wis)=}')
        streamer = ResultsStreamer(wis, self)
        with torch.no_grad():
            messages = [self.llm_tokenizer.apply_chat_template(list(r.context), tokenize=False,
                                                               add_generation_prompt=True)
                        for r in wis]
            model_inputs = self.llm_tokenizer(messages, return_tensors="pt", padding=True).to(self.llm_model.device)
            self.llm_model.generate(
                **model_inputs,
                max_new_tokens=16 * 1024,
                output_scores=True,
                return_dict_in_generate=True,
                streamer=streamer,
            )
            torch.xpu.synchronize()