     ChatCompletionResponse,
     FunctionCall,
     ToolCall,
+    Usage,
 )

 # MLX Imports
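For reference, the newly imported Usage type is the OpenAI-style usage object defined elsewhere in this repo; since the response models call model_dump(), they appear to be Pydantic models. A minimal sketch of what Usage presumably looks like (the real definition may differ):

# Hypothetical sketch of the imported Usage schema; the actual class lives
# in the project's schema module and may differ in field names or defaults.
from typing import Optional

from pydantic import BaseModel


class Usage(BaseModel):
    prompt_tokens: Optional[int] = None
    completion_tokens: Optional[int] = None
    total_tokens: Optional[int] = None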
@@ -161,7 +162,9 @@ def apply_lm_chat_template(
     return request.messages[-1].content


-def handle_function_calls(output: str, request):
+def handle_function_calls(
+    output: str, request: ChatCompletionRequest, token_info: Usage
+) -> ChatCompletionResponse:
     tool_calls = []

     # Check for JSON format tool calls
@@ -264,6 +267,7 @@ def handle_function_calls(output: str, request):
         id=f"chatcmpl-{os.urandom(4).hex()}",
         created=int(time.time()),
         model=request.model,
+        usage=token_info,
         choices=[
             {
                 "index": 0,
@@ -290,7 +294,9 @@ def load_vlm_model(model_name: str, config: Dict[str, Any]) -> Dict[str, Any]:


 def load_lm_model(model_name: str, config: Dict[str, Any]) -> Dict[str, Any]:
-    model, tokenizer = lm_load(model_name)
+    time_start = time.time()
+    model, tokenizer = lm_load(model_name, model_config=config)
+    print(f"Model loaded in {time.time() - time_start:.2f} seconds.")
     return {"model": model, "tokenizer": tokenizer, "config": config}

@@ -303,7 +309,15 @@ def vlm_stream_generator(
     image_processor,
     max_tokens,
     temperature,
+    stream_options,
 ):
+    INCLUDE_USAGE = (
+        False if stream_options == None else stream_options.get("include_usage", False)
+    )
+    completion_tokens = 0
+    prompt_tokens = len(mx.array(processor.encode(prompt))) if INCLUDE_USAGE else None
+    empty_usage: Usage = None
+
     for token in vlm_stream_generate(
         model,
         processor,
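Note: stream_options follows the OpenAI chat-completions convention here: it is optional, and only {"include_usage": true} changes behavior. A request body that would switch the new accounting on might look like the following sketch (the model id is a placeholder, not taken from this diff):

# Illustrative request payload; stream_options is the newly supported part.
payload = {
    "model": "some-mlx-model",  # placeholder model id
    "messages": [{"role": "user", "content": "Describe this image."}],
    "stream": True,
    "stream_options": {"include_usage": True},
}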
@@ -313,10 +327,15 @@ def vlm_stream_generator(
         max_tokens=max_tokens,
         temp=temperature,
     ):
+        # Update token length info
+        if INCLUDE_USAGE:
+            completion_tokens += 1
+
         chunk = ChatCompletionChunk(
             id=f"chatcmpl-{os.urandom(4).hex()}",
             created=int(time.time()),
             model=model_name,
+            usage=empty_usage,
             choices=[
                 {
                     "index": 0,
@@ -326,6 +345,20 @@ def vlm_stream_generator(
             ],
         )
         yield f"data: {json.dumps(chunk.model_dump())}\n\n"
+
+    if INCLUDE_USAGE:
+        chunk = ChatCompletionChunk(
+            id=f"chatcmpl-{os.urandom(4).hex()}",
+            created=int(time.time()),
+            model=model_name,
+            choices=[],
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+            ),
+        )
+        yield f"data: {json.dumps(chunk.model_dump())}\n\n"
     yield "data: [DONE]\n\n"

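For illustration, the extra chunk emitted just before [DONE] when include_usage is set carries empty choices plus the totals; with placeholder values it serializes to roughly the dict below (extra fields from ChatCompletionChunk's defaults may also appear):

# Approximate shape of the final usage chunk; all values are illustrative.
final_chunk = {
    "id": "chatcmpl-1a2b3c4d",
    "created": 1700000000,
    "model": "some-mlx-model",
    "choices": [],
    "usage": {"prompt_tokens": 42, "completion_tokens": 128, "total_tokens": 170},
}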
@@ -361,6 +394,7 @@ def lm_generate(
     )

     prompt_tokens = mx.array(tokenizer.encode(prompt))
+    prompt_token_len = len(prompt_tokens)
     detokenizer = tokenizer.detokenizer

     detokenizer.reset()
@@ -377,24 +411,49 @@ def lm_generate(
         detokenizer.add_token(token)

     detokenizer.finalize()
-    return detokenizer.text
+
+    _completion_tokens = len(detokenizer.tokens)
+    token_length_info: Usage = Usage(
+        prompt_tokens=prompt_token_len,
+        completion_tokens=_completion_tokens,
+        total_tokens=prompt_token_len + _completion_tokens,
+    )
+    return detokenizer.text, token_length_info


 def lm_stream_generator(
-    model, model_name, tokenizer, prompt, max_tokens, temperature, **kwargs
+    model,
+    model_name,
+    tokenizer,
+    prompt,
+    max_tokens,
+    temperature,
+    stream_options,
+    **kwargs,
 ):
     stop_words = kwargs.pop("stop_words", [])
+    INCLUDE_USAGE = (
+        False if stream_options == None else stream_options.get("include_usage", False)
+    )
+    prompt_tokens = len(tokenizer.encode(prompt)) if INCLUDE_USAGE else None
+    completion_tokens = 0
+    empty_usage: Usage = None

     for token in lm_stream_generate(
         model, tokenizer, prompt, max_tokens=max_tokens, temp=temperature
     ):
         if stop_words and token in stop_words:
             break

+        # Update token length info
+        if INCLUDE_USAGE:
+            completion_tokens += 1
+
         chunk = ChatCompletionChunk(
             id=f"chatcmpl-{os.urandom(4).hex()}",
             created=int(time.time()),
             model=model_name,
+            usage=empty_usage,
             choices=[
                 {
                     "index": 0,
@@ -405,4 +464,18 @@ def lm_stream_generator(
         )
         yield f"data: {json.dumps(chunk.model_dump())}\n\n"

+    if INCLUDE_USAGE:
+        chunk = ChatCompletionChunk(
+            id=f"chatcmpl-{os.urandom(4).hex()}",
+            created=int(time.time()),
+            model=model_name,
+            choices=[],
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+            ),
+        )
+        yield f"data: {json.dumps(chunk.model_dump())}\n\n"
+
     yield "data: [DONE]\n\n"