From 508ac09d2d31cc0c7d53b35d8fdd23f845f7ee28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis=20Brul=C3=A9=20Naudet?= <louisbrulenaudet@icloud.com>
Date: Thu, 5 Sep 2024 23:11:12 +0200
Subject: [PATCH 1/7] Update llm_engine.py

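- Added support for optional `token`, `max_tokens`, and `timeout` parameters in the
  `HfApiEngine` constructor; `max_tokens` replaces the hard-coded 1500 in `__call__`.
- Raise a `ValueError` when an empty model name is passed to the constructor.
- Provided usage examples and detailed documentation for each method.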
---
 src/transformers/agents/llm_engine.py | 79 +++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 6 deletions(-)

diff --git a/src/transformers/agents/llm_engine.py b/src/transformers/agents/llm_engine.py
index 5c36c2922fa2..f9f41de3c175 100644
--- a/src/transformers/agents/llm_engine.py
+++ b/src/transformers/agents/llm_engine.py
@@ -68,25 +68,92 @@ def get_clean_message_list(message_list: List[Dict[str, str]], role_conversions:
 
 
 class HfApiEngine:
-    """This engine leverages Hugging Face's Inference API service, either serverless or with a dedicated endpoint."""
+    """
+    A class to interact with Hugging Face's Inference API for language model interaction.
+
+    This engine allows you to communicate with Hugging Face's models using the Inference API.
+    It can be used in both serverless mode or with a dedicated endpoint, supporting features 
+    like stop sequences and grammar customization.
 
-    def __init__(self, model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct"):
+    Args:
+        model (str, optional): The Hugging Face model ID to be used for inference. This can be a path or model 
+            identifier from the Hugging Face model hub (default is "meta-llama/Meta-Llama-3.1-8B-Instruct").
+        token (str, optional): The Hugging Face API token for authentication. If not provided, the class will use
+            the token stored in the Hugging Face CLI configuration.
+        max_tokens (int, optional): The maximum number of tokens allowed in the output (default is 1500).
+        timeout (int, optional): Timeout for the API request, in seconds (default is 120).
+
+    Attributes:
+        model (str): The model ID being used for inference.
+        client (InferenceClient): The Hugging Face Inference API client for communicating with the language model.
+
+    Raises:
+        ValueError: If the model name is not provided.
+    """
+    def __init__(
+        self, 
+        model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct", 
+        token: Optional[str] = None, 
+        max_tokens: int = 1500, 
+        timeout: int = 120
+    ):
+        """
+        Initializes the HfApiEngine.
+
+        Args:
+            model (str, optional): The Hugging Face model to use (default is 'meta-llama/Meta-Llama-3.1-8B-Instruct').
+            token (str, optional): The Hugging Face API token for authentication.
+            max_tokens (int, optional): The maximum number of tokens allowed in the response (default is 1500).
+            timeout (int, optional): The API request timeout, in seconds (default is 120).
+        """
+        if not model:
+            raise ValueError("Model name must be provided.")
+        
         self.model = model
-        self.client = InferenceClient(self.model, timeout=120)
+        self.client = InferenceClient(self.model, token=token, timeout=timeout)
+        self.max_tokens = max_tokens
 
     def __call__(
         self, messages: List[Dict[str, str]], stop_sequences: List[str] = [], grammar: Optional[str] = None
     ) -> str:
+        """
+        Processes the input messages and returns the model's response.
+
+        This method sends a list of messages to the Hugging Face Inference API, optionally 
+        with stop sequences and grammar customization.
+
+        Args:
+            messages (List[Dict[str, str]]): A list of message dictionaries to be processed. 
+                Each dictionary should have the structure {"role": "user/system", "content": "message content"}.
+            stop_sequences (List[str], optional): A list of strings that will stop the generation 
+                if encountered in the model's output. Defaults to an empty list.
+            grammar (str, optional): The grammar or formatting structure to use in the model's response. 
+                Default is None, which means no specific grammar.
+
+        Returns:
+            str: The text content of the model's response.
+
+        Examples:
+            >>> engine = HfApiEngine(
+            ...     model="meta-llama/Meta-Llama-3.1-8B-Instruct", 
+            ...     token="your_hf_token_here", 
+            ...     max_tokens=2000
+            ... )
+            >>> messages = [{"role": "user", "content": "Explain quantum mechanics in simple terms."}]
+            >>> response = engine(messages, stop_sequences=["END"])
+            >>> print(response)
+            "Quantum mechanics is the branch of physics that studies..."
+        """
         # Get clean message list
         messages = get_clean_message_list(messages, role_conversions=llama_role_conversions)
 
-        # Get LLM output
+        # Send messages to the Hugging Face Inference API
         if grammar is not None:
             response = self.client.chat_completion(
-                messages, stop=stop_sequences, max_tokens=1500, response_format=grammar
+                messages, stop=stop_sequences, max_tokens=self.max_tokens, response_format=grammar
             )
         else:
-            response = self.client.chat_completion(messages, stop=stop_sequences, max_tokens=1500)
+            response = self.client.chat_completion(messages, stop=stop_sequences, max_tokens=self.max_tokens)
 
         response = response.choices[0].message.content
 

From 479e6e1a8127164d47ed0c4d7d3bb43dc0c33375 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis=20Brul=C3=A9=20Naudet?= <louisbrulenaudet@icloud.com>
Date: Mon, 28 Oct 2024 00:15:50 +0100
Subject: [PATCH 2/7] Update llm_engine.py

Remove trailing whitespace and unwrap multi-line docstring entries.
---
 src/transformers/agents/llm_engine.py | 35 +++++++++++----------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/src/transformers/agents/llm_engine.py b/src/transformers/agents/llm_engine.py
index f9f41de3c175..ae923edb7302 100644
--- a/src/transformers/agents/llm_engine.py
+++ b/src/transformers/agents/llm_engine.py
@@ -72,14 +72,11 @@ class HfApiEngine:
     A class to interact with Hugging Face's Inference API for language model interaction.
 
     This engine allows you to communicate with Hugging Face's models using the Inference API.
-    It can be used in both serverless mode or with a dedicated endpoint, supporting features 
-    like stop sequences and grammar customization.
+    It can be used in both serverless mode or with a dedicated endpoint, supporting features like stop sequences and grammar customization.
 
     Args:
-        model (str, optional): The Hugging Face model ID to be used for inference. This can be a path or model 
-            identifier from the Hugging Face model hub (default is "meta-llama/Meta-Llama-3.1-8B-Instruct").
-        token (str, optional): The Hugging Face API token for authentication. If not provided, the class will use
-            the token stored in the Hugging Face CLI configuration.
+        model (str, optional): The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub (default is "meta-llama/Meta-Llama-3.1-8B-Instruct").
+        token (str, optional): The Hugging Face API token for authentication. If not provided, the class will use the token stored in the Hugging Face CLI configuration.
         max_tokens (int, optional): The maximum number of tokens allowed in the output (default is 1500).
         timeout (int, optional): Timeout for the API request, in seconds (default is 120).
 
@@ -91,10 +88,10 @@ class HfApiEngine:
         ValueError: If the model name is not provided.
     """
     def __init__(
-        self, 
-        model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct", 
-        token: Optional[str] = None, 
-        max_tokens: int = 1500, 
+        self,
+        model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
+        token: Optional[str] = None,
+        max_tokens: int = 1500,
         timeout: int = 120
     ):
         """
@@ -108,7 +105,7 @@ def __init__(
         """
         if not model:
             raise ValueError("Model name must be provided.")
-        
+
         self.model = model
         self.client = InferenceClient(self.model, token=token, timeout=timeout)
         self.max_tokens = max_tokens
@@ -119,24 +116,20 @@ def __call__(
         """
         Processes the input messages and returns the model's response.
 
-        This method sends a list of messages to the Hugging Face Inference API, optionally 
-        with stop sequences and grammar customization.
+        This method sends a list of messages to the Hugging Face Inference API, optionally with stop sequences and grammar customization.
 
         Args:
-            messages (List[Dict[str, str]]): A list of message dictionaries to be processed. 
-                Each dictionary should have the structure {"role": "user/system", "content": "message content"}.
-            stop_sequences (List[str], optional): A list of strings that will stop the generation 
-                if encountered in the model's output. Defaults to an empty list.
-            grammar (str, optional): The grammar or formatting structure to use in the model's response. 
-                Default is None, which means no specific grammar.
+            messages (List[Dict[str, str]]): A list of message dictionaries to be processed. Each dictionary should have the structure {"role": "user/system", "content": "message content"}.
+            stop_sequences (List[str], optional): A list of strings that will stop the generation if encountered in the model's output. Defaults to an empty list.
+            grammar (str, optional): The grammar or formatting structure to use in the model's response. Default is None, which means no specific grammar.
 
         Returns:
             str: The text content of the model's response.
 
         Examples:
             >>> engine = HfApiEngine(
-            ...     model="meta-llama/Meta-Llama-3.1-8B-Instruct", 
-            ...     token="your_hf_token_here", 
+            ...     model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+            ...     token="your_hf_token_here",
             ...     max_tokens=2000
             ... )
             >>> messages = [{"role": "user", "content": "Explain quantum mechanics in simple terms."}]

From 863b9ddd473d6df2b92d4e5990cc5b2d87bd30f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis=20Brul=C3=A9=20Naudet?= <louisbrulenaudet@icloud.com>
Date: Mon, 28 Oct 2024 00:23:24 +0100
Subject: [PATCH 3/7] Update llm_engine.py

Apply automatic formatting fixes: add a blank line before `__init__` and a
trailing comma after the last constructor parameter.
---
 src/transformers/agents/llm_engine.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/agents/llm_engine.py b/src/transformers/agents/llm_engine.py
index ae923edb7302..f7c082689c18 100644
--- a/src/transformers/agents/llm_engine.py
+++ b/src/transformers/agents/llm_engine.py
@@ -87,12 +87,13 @@ class HfApiEngine:
     Raises:
         ValueError: If the model name is not provided.
     """
+
     def __init__(
         self,
         model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
         token: Optional[str] = None,
         max_tokens: int = 1500,
-        timeout: int = 120
+        timeout: int = 120,
     ):
         """
         Initializes the HfApiEngine.

From 0b242f9d7751b7f152d2add21f2cf49125af1995 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis=20Brul=C3=A9=20Naudet?= <louisbrulenaudet@icloud.com>
Date: Mon, 28 Oct 2024 00:40:14 +0100
Subject: [PATCH 4/7] Update llm_engine.py

---
 src/transformers/agents/llm_engine.py | 66 ++++++++++++++-------------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/src/transformers/agents/llm_engine.py b/src/transformers/agents/llm_engine.py
index f7c082689c18..904ee52a3ba5 100644
--- a/src/transformers/agents/llm_engine.py
+++ b/src/transformers/agents/llm_engine.py
@@ -68,24 +68,27 @@ def get_clean_message_list(message_list: List[Dict[str, str]], role_conversions:
 
 
 class HfApiEngine:
-    """
-    A class to interact with Hugging Face's Inference API for language model interaction.
+    """A class to interact with Hugging Face's Inference API for language model interaction.
 
     This engine allows you to communicate with Hugging Face's models using the Inference API.
-    It can be used in both serverless mode or with a dedicated endpoint, supporting features like stop sequences and grammar customization.
-
-    Args:
-        model (str, optional): The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub (default is "meta-llama/Meta-Llama-3.1-8B-Instruct").
-        token (str, optional): The Hugging Face API token for authentication. If not provided, the class will use the token stored in the Hugging Face CLI configuration.
-        max_tokens (int, optional): The maximum number of tokens allowed in the output (default is 1500).
-        timeout (int, optional): Timeout for the API request, in seconds (default is 120).
-
-    Attributes:
-        model (str): The model ID being used for inference.
-        client (InferenceClient): The Hugging Face Inference API client for communicating with the language model.
+    It can be used in both serverless mode or with a dedicated endpoint, supporting features
+    like stop sequences and grammar customization.
+
+    Parameters:
+        model (`str`, *optional*, defaults to "meta-llama/Meta-Llama-3.1-8B-Instruct"):
+            The Hugging Face model ID to be used for inference. This can be a path or model
+            identifier from the Hugging Face model hub.
+        token (`str`, *optional*):
+            The Hugging Face API token for authentication. If not provided, the class will
+            use the token stored in the Hugging Face CLI configuration.
+        max_tokens (`int`, *optional*, defaults to 1500):
+            The maximum number of tokens allowed in the output.
+        timeout (`int`, *optional*, defaults to 120):
+            Timeout for the API request, in seconds.
 
     Raises:
-        ValueError: If the model name is not provided.
+        ValueError:
+            If the model name is not provided.
     """
 
     def __init__(
@@ -95,15 +98,7 @@ def __init__(
         max_tokens: int = 1500,
         timeout: int = 120,
     ):
-        """
-        Initializes the HfApiEngine.
-
-        Args:
-            model (str, optional): The Hugging Face model to use (default is 'meta-llama/Meta-Llama-3.1-8B-Instruct').
-            token (str, optional): The Hugging Face API token for authentication.
-            max_tokens (int, optional): The maximum number of tokens allowed in the response (default is 1500).
-            timeout (int, optional): The API request timeout, in seconds (default is 120).
-        """
+        """Initialize the HfApiEngine."""
         if not model:
             raise ValueError("Model name must be provided.")
 
@@ -114,20 +109,26 @@ def __init__(
     def __call__(
         self, messages: List[Dict[str, str]], stop_sequences: List[str] = [], grammar: Optional[str] = None
     ) -> str:
-        """
-        Processes the input messages and returns the model's response.
+        """Process the input messages and return the model's response.
 
-        This method sends a list of messages to the Hugging Face Inference API, optionally with stop sequences and grammar customization.
+        This method sends a list of messages to the Hugging Face Inference API, optionally with
+        stop sequences and grammar customization.
 
-        Args:
-            messages (List[Dict[str, str]]): A list of message dictionaries to be processed. Each dictionary should have the structure {"role": "user/system", "content": "message content"}.
-            stop_sequences (List[str], optional): A list of strings that will stop the generation if encountered in the model's output. Defaults to an empty list.
-            grammar (str, optional): The grammar or formatting structure to use in the model's response. Default is None, which means no specific grammar.
+        Parameters:
+            messages (`List[Dict[str, str]]`):
+                A list of message dictionaries to be processed. Each dictionary should have
+                the structure `{"role": "user/system", "content": "message content"}`.
+            stop_sequences (`List[str]`, *optional*):
+                A list of strings that will stop the generation if encountered in the
+                model's output.
+            grammar (`str`, *optional*):
+                The grammar or formatting structure to use in the model's response.
 
         Returns:
-            str: The text content of the model's response.
+            `str`: The text content of the model's response.
 
-        Examples:
+        Example:
+            ```python
             >>> engine = HfApiEngine(
             ...     model="meta-llama/Meta-Llama-3.1-8B-Instruct",
             ...     token="your_hf_token_here",
@@ -137,6 +138,7 @@ def __call__(
             >>> response = engine(messages, stop_sequences=["END"])
             >>> print(response)
             "Quantum mechanics is the branch of physics that studies..."
+            ```
         """
         # Get clean message list
         messages = get_clean_message_list(messages, role_conversions=llama_role_conversions)

From 7fcbc19c96e3ae9eeb6978a2cfe7d06cc8facdad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis=20Brul=C3=A9=20Naudet?= <louisbrulenaudet@icloud.com>
Date: Mon, 28 Oct 2024 00:45:39 +0100
Subject: [PATCH 5/7] Update llm_engine.py

---
 src/transformers/agents/llm_engine.py | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/src/transformers/agents/llm_engine.py b/src/transformers/agents/llm_engine.py
index 904ee52a3ba5..895ac271cf63 100644
--- a/src/transformers/agents/llm_engine.py
+++ b/src/transformers/agents/llm_engine.py
@@ -70,17 +70,13 @@ def get_clean_message_list(message_list: List[Dict[str, str]], role_conversions:
 class HfApiEngine:
     """A class to interact with Hugging Face's Inference API for language model interaction.
 
-    This engine allows you to communicate with Hugging Face's models using the Inference API.
-    It can be used in both serverless mode or with a dedicated endpoint, supporting features
-    like stop sequences and grammar customization.
+    This engine allows you to communicate with Hugging Face's models using the Inference API. It can be used in both serverless mode or with a dedicated endpoint, supporting features like stop sequences and grammar customization.
 
     Parameters:
         model (`str`, *optional*, defaults to "meta-llama/Meta-Llama-3.1-8B-Instruct"):
-            The Hugging Face model ID to be used for inference. This can be a path or model
-            identifier from the Hugging Face model hub.
+            The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub.
         token (`str`, *optional*):
-            The Hugging Face API token for authentication. If not provided, the class will
-            use the token stored in the Hugging Face CLI configuration.
+            The Hugging Face API token for authentication. If not provided, the class will use the token stored in the Hugging Face CLI configuration.
         max_tokens (`int`, *optional*, defaults to 1500):
             The maximum number of tokens allowed in the output.
         timeout (`int`, *optional*, defaults to 120):
@@ -111,16 +107,13 @@ def __call__(
     ) -> str:
         """Process the input messages and return the model's response.
 
-        This method sends a list of messages to the Hugging Face Inference API, optionally with
-        stop sequences and grammar customization.
+        This method sends a list of messages to the Hugging Face Inference API, optionally with stop sequences and grammar customization.
 
         Parameters:
             messages (`List[Dict[str, str]]`):
-                A list of message dictionaries to be processed. Each dictionary should have
-                the structure `{"role": "user/system", "content": "message content"}`.
+                A list of message dictionaries to be processed. Each dictionary should have the structure `{"role": "user/system", "content": "message content"}`.
             stop_sequences (`List[str]`, *optional*):
-                A list of strings that will stop the generation if encountered in the
-                model's output.
+                A list of strings that will stop the generation if encountered in the model's output.
             grammar (`str`, *optional*):
                 The grammar or formatting structure to use in the model's response.
 

From fd7935425931b8d17d7b0cef822bafa94813eca0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis=20Brul=C3=A9=20Naudet?= <louisbrulenaudet@icloud.com>
Date: Sat, 2 Nov 2024 13:04:38 +0100
Subject: [PATCH 6/7] Update llm_engine.py

---
 src/transformers/agents/llm_engine.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/transformers/agents/llm_engine.py b/src/transformers/agents/llm_engine.py
index 895ac271cf63..f15feaa11323 100644
--- a/src/transformers/agents/llm_engine.py
+++ b/src/transformers/agents/llm_engine.py
@@ -66,14 +66,13 @@ def get_clean_message_list(message_list: List[Dict[str, str]], role_conversions:
     MessageRole.TOOL_RESPONSE: MessageRole.USER,
 }
 
-
 class HfApiEngine:
     """A class to interact with Hugging Face's Inference API for language model interaction.
 
     This engine allows you to communicate with Hugging Face's models using the Inference API. It can be used in both serverless mode or with a dedicated endpoint, supporting features like stop sequences and grammar customization.
 
     Parameters:
-        model (`str`, *optional*, defaults to "meta-llama/Meta-Llama-3.1-8B-Instruct"):
+        model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3.1-8B-Instruct"`):
             The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub.
         token (`str`, *optional*):
             The Hugging Face API token for authentication. If not provided, the class will use the token stored in the Hugging Face CLI configuration.
@@ -103,7 +102,10 @@ def __init__(
         self.max_tokens = max_tokens
 
     def __call__(
-        self, messages: List[Dict[str, str]], stop_sequences: List[str] = [], grammar: Optional[str] = None
+        self,
+        messages: List[Dict[str, str]],
+        stop_sequences: Optional[List[str]] = [],
+        grammar: Optional[str] = None,
     ) -> str:
         """Process the input messages and return the model's response.
 
@@ -152,7 +154,6 @@ def __call__(
                 response = response[: -len(stop_seq)]
         return response
 
-
 class TransformersEngine:
     """This engine uses a pre-initialized local text-generation pipeline."""
 

From 7325f247bac9eec30852f04642b9c233bc5872d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis=20Brul=C3=A9=20Naudet?= <louisbrulenaudet@icloud.com>
Date: Sat, 2 Nov 2024 13:09:50 +0100
Subject: [PATCH 7/7] Update llm_engine.py

---
 src/transformers/agents/llm_engine.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/agents/llm_engine.py b/src/transformers/agents/llm_engine.py
index f15feaa11323..456c6172a77c 100644
--- a/src/transformers/agents/llm_engine.py
+++ b/src/transformers/agents/llm_engine.py
@@ -66,6 +66,7 @@ def get_clean_message_list(message_list: List[Dict[str, str]], role_conversions:
     MessageRole.TOOL_RESPONSE: MessageRole.USER,
 }
 
+
 class HfApiEngine:
     """A class to interact with Hugging Face's Inference API for language model interaction.
 
@@ -90,8 +91,8 @@ def __init__(
         self,
         model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
         token: Optional[str] = None,
-        max_tokens: int = 1500,
-        timeout: int = 120,
+        max_tokens: Optional[int] = 1500,
+        timeout: Optional[int] = 120,
     ):
         """Initialize the HfApiEngine."""
         if not model:
@@ -154,6 +155,7 @@ def __call__(
                 response = response[: -len(stop_seq)]
         return response
 
+
 class TransformersEngine:
     """This engine uses a pre-initialized local text-generation pipeline."""