feat: retriever cookbook and utils change (#472)

Co-authored-by: Appointat <[email protected]>
camel-ai · Mar 27, 2024 · e080f25 · e080f25
1 parent c276ab3
commit e080f25
Show file tree

Hide file tree

Showing 14 changed files with 791 additions and 176 deletions.
diff --git a/camel/functions/__init__.py b/camel/functions/__init__.py
@@ -20,7 +20,7 @@
 from .math_functions import MATH_FUNCS
 from .search_functions import SEARCH_FUNCS
 from .weather_functions import WEATHER_FUNCS
-from .unstructured_io_fuctions import UnstructuredModules
+from ..loaders.unstructured_io import UnstructuredIO
 
 __all__ = [
     'OpenAIFunction',
@@ -29,5 +29,5 @@
     'MATH_FUNCS',
     'SEARCH_FUNCS',
     'WEATHER_FUNCS',
-    'UnstructuredModules',
+    'UnstructuredIO',
 ]
diff --git a/camel/loaders/__init__.py b/camel/loaders/__init__.py
@@ -0,0 +1,22 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+
+from .base_io import File, read_file
+from .unstructured_io import UnstructuredIO
+
+__all__ = [
+    'File',
+    'read_file',
+    'UnstructuredIO',
+]
diff --git a/camel/functions/base_io_functions.py → camel/loaders/base_io.py b/camel/functions/base_io_functions.py → camel/loaders/base_io.py
@@ -35,10 +35,10 @@ def __init__(
         Args:
             name (str): The name of the file.
             id (str): The unique identifier of the file.
-            metadata (Dict[str, Any], optional):
-            Additional metadata associated with the file. Defaults to None.
-            docs (List[Dict[str, Any]], optional):
-            A list of documents contained within the file. Defaults to None.
+            metadata (Dict[str, Any], optional): Additional metadata
+                associated with the file. Defaults to None.
+            docs (List[Dict[str, Any]], optional): A list of documents
+                contained within the file. Defaults to None.
         """
         self.name = name
         self.id = id
@@ -51,8 +51,8 @@ def from_bytes(cls, file: BytesIO) -> "File":
         r"""Creates a File object from a BytesIO object.
 
         Args:
-            file (BytesIO):
-            A BytesIO object representing the contents of the file.
+            file (BytesIO): A BytesIO object representing the contents of the
+                file.
 
         Returns:
             File: A File object.
@@ -96,8 +96,8 @@ def from_bytes(cls, file: BytesIO) -> "DocxFile":
         r"""Creates a DocxFile object from a BytesIO object.
 
         Args:
-            file (BytesIO):
-            A BytesIO object representing the contents of the docx file.
+            file (BytesIO): A BytesIO object representing the contents of the
+                docx file.
 
         Returns:
             DocxFile: A DocxFile object.
@@ -127,8 +127,8 @@ def from_bytes(cls, file: BytesIO) -> "PdfFile":
         r"""Creates a PdfFile object from a BytesIO object.
 
         Args:
-            file (BytesIO):
-            A BytesIO object representing the contents of the pdf file.
+            file (BytesIO): A BytesIO object representing the contents of the
+                pdf file.
 
         Returns:
             PdfFile: A PdfFile object.
@@ -162,8 +162,8 @@ def from_bytes(cls, file: BytesIO) -> "TxtFile":
         r"""Creates a TxtFile object from a BytesIO object.
 
         Args:
-            file (BytesIO):
-            A BytesIO object representing the contents of the txt file.
+            file (BytesIO): A BytesIO object representing the contents of the
+                txt file.
 
         Returns:
             TxtFile: A TxtFile object.
@@ -187,8 +187,8 @@ def from_bytes(cls, file: BytesIO) -> "JsonFile":
         r"""Creates a JsonFile object from a BytesIO object.
 
         Args:
-            file (BytesIO):
-            A BytesIO object representing the contents of the json file.
+            file (BytesIO): A BytesIO object representing the contents of the
+                json file.
 
         Returns:
             JsonFile: A JsonFile object.
@@ -211,8 +211,8 @@ def from_bytes(cls, file: BytesIO) -> "HtmlFile":
         r"""Creates a HtmlFile object from a BytesIO object.
 
         Args:
-            file (BytesIO):
-            A BytesIO object representing the contents of the html file.
+            file (BytesIO): A BytesIO object representing the contents of the
+                html file.
 
         Returns:
             HtmlFile: A HtmlFile object.

diff --git a/camel/functions/unstructured_io_fuctions.py → camel/loaders/unstructured_io.py b/camel/functions/unstructured_io_fuctions.py → camel/loaders/unstructured_io.py
@@ -15,7 +15,7 @@
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
 
-class UnstructuredModules:
+class UnstructuredIO:
     r"""A class to handle various functionalities provided by the
     Unstructured library, including version checking, parsing, cleaning,
     extracting, staging, chunking data, and integrating with cloud
@@ -29,13 +29,13 @@ class UnstructuredModules:
     UNSTRUCTURED_MIN_VERSION = "0.10.30"  # Define the minimum version
 
     def __init__(self):
-        r"""Initializes the UnstructuredModules class and ensures the
+        r"""Initializes the UnstructuredIO class and ensures the
         installed version of Unstructured library meets the minimum
         requirements.
         """
-        self.ensure_unstructured_version(self.UNSTRUCTURED_MIN_VERSION)
+        self._ensure_unstructured_version(self.UNSTRUCTURED_MIN_VERSION)
 
-    def ensure_unstructured_version(self, min_version: str) -> None:
+    def _ensure_unstructured_version(self, min_version: str) -> None:
         r"""Validates that the installed 'Unstructured' library version
         satisfies the specified minimum version requirement. This function is
         essential for ensuring compatibility with features that depend on a
@@ -80,7 +80,7 @@ def parse_file_or_url(
 
         Args:
             input_path (str): Path to the file or URL to be parsed.
-             **kwargs Extra kwargs passed to the partition function
+            **kwargs: Extra kwargs passed to the partition function.
 
         Returns:
             List[Any]: The elements after parsing the file or URL, could be a
@@ -147,10 +147,10 @@ def clean_text_data(
         clean_options: Optional[List[Tuple[str, Dict[str, Any]]]] = None,
     ) -> str:
         r"""Cleans text data using a variety of cleaning functions provided by
-        the `'unstructured'` library.
+        the `unstructured` library.
 
         This function applies multiple text cleaning utilities by calling the
-        `'unstructured'` library's cleaning bricks for operations like
+        `unstructured` library's cleaning bricks for operations like
         replacing unicode quotes, removing extra whitespace, dashes, non-ascii
         characters, and more.
 
@@ -161,36 +161,27 @@ def clean_text_data(
 
         Args:
             text (str): The text to be cleaned.
-            clean_options (dict): A dictionary specifying which
-                                cleaning options to apply. The keys should
-                                match the names of the cleaning functions,
-                                and the values should be dictionaries
-                                containing the parameters for each
-                                function. Supported types:
-                                'clean_extra_whitespace',
-                                'clean_bullets',
-                                'clean_ordered_bullets',
-                                'clean_postfix',
-                                'clean_prefix',
-                                'clean_dashes',
-                                'clean_trailing_punctuation',
-                                'clean_non_ascii_chars',
-                                'group_broken_paragraphs',
-                                'remove_punctuation',
-                                'replace_unicode_quotes',
-                                'bytes_string_to_string',
-                                'translate_text'.
+            clean_options (List[Tuple[str, Dict[str, Any]]], optional): A
+                list of `(name, params)` tuples naming the cleaning functions
+                to apply and giving each one's parameters as a dictionary.
+                Defaults to None. Supported types:
+                'clean_extra_whitespace', 'clean_bullets',
+                'clean_ordered_bullets', 'clean_postfix', 'clean_prefix',
+                'clean_dashes', 'clean_trailing_punctuation',
+                'clean_non_ascii_chars', 'group_broken_paragraphs',
+                'remove_punctuation', 'replace_unicode_quotes',
+                'bytes_string_to_string', 'translate_text'.
 
         Returns:
             str: The cleaned text.
 
         Raises:
             ValueError: If a cleaning option does not correspond to a
-                valid cleaning function in 'unstructured'.
+                valid cleaning function in `unstructured`.
 
         Notes:
             The 'options' dictionary keys must correspond to valid cleaning
-            brick names from the 'unstructured' library.
+            brick names from the `unstructured` library.
             Each brick's parameters must be provided in a nested dictionary
             as the value for the key.
 
@@ -246,7 +237,7 @@ def clean_text_data(
                                                              **params)
             else:
                 raise ValueError(
-                    f"'{func_name}' is not a valid function in 'unstructured'."
+                    f"'{func_name}' is not a valid function in `unstructured`."
                 )
 
         return cleaned_text
@@ -267,15 +258,11 @@ def extract_data_from_text(
         Args:
             text (str): Text to extract data from.
             extract_type (Literal['extract_datetimetz',
-                                  'extract_email_address',
-                                  'extract_ip_address',
-                                  'extract_ip_address_name',
-                                  'extract_mapi_id',
-                                  'extract_ordered_bullets',
-                                  'extract_text_after',
-                                  'extract_text_before',
-                                  'extract_us_phone_number']):
-                                Type of data to extract.
+                'extract_email_address', 'extract_ip_address',
+                'extract_ip_address_name', 'extract_mapi_id',
+                'extract_ordered_bullets', 'extract_text_after',
+                'extract_text_before', 'extract_us_phone_number']): Type of
+                data to extract.
             **kwargs: Additional keyword arguments for specific
                 extraction functions.
 
@@ -337,18 +324,12 @@ def stage_elements(
 
         Args:
             elements (List[Any]): List of Element objects to be staged.
-            stage_type (Literal['convert_to_csv',
-                                'convert_to_dataframe',
-                                'convert_to_dict',
-                                'dict_to_elements',
-                                'stage_csv_for_prodigy',
-                                'stage_for_prodigy',
-                                'stage_for_baseplate',
-                                'stage_for_datasaur',
-                                'stage_for_label_box',
-                                'stage_for_label_studio',
-                                'stage_for_weaviate']):
-                Type of staging to perform.
+            stage_type (Literal['convert_to_csv', 'convert_to_dataframe',
+                'convert_to_dict', 'dict_to_elements',
+                'stage_csv_for_prodigy', 'stage_for_prodigy',
+                'stage_for_baseplate', 'stage_for_datasaur',
+                'stage_for_label_box', 'stage_for_label_studio',
+                'stage_for_weaviate']): Type of staging to perform.
             **kwargs: Additional keyword arguments specific to
                 the staging type.
 
@@ -413,7 +394,7 @@ def chunk_elements(self, elements: List[Any], chunk_type: str,
         Args:
             elements (List[Any]): List of Element objects to be chunked.
             chunk_type (str): Type of chunking to apply. Supported types:
-                            'chunk_by_title'.
+                'chunk_by_title'.
             **kwargs: Additional keyword arguments for chunking.
 
         Returns:

diff --git a/camel/retrievers/vector_retriever.py b/camel/retrievers/vector_retriever.py
@@ -14,7 +14,7 @@
 from typing import Any, Dict, List, Optional
 
 from camel.embeddings import BaseEmbedding, OpenAIEmbedding
-from camel.functions import UnstructuredModules
+from camel.functions import UnstructuredIO
 from camel.retrievers import BaseRetriever
 from camel.storages import BaseVectorStorage, VectorDBQuery, VectorRecord
 
@@ -58,7 +58,7 @@ def process(  # type: ignore
                 embeddings.
             **kwargs (Any): Additional keyword arguments for elements chunking.
         """
-        unstructured_modules = UnstructuredModules()
+        unstructured_modules = UnstructuredIO()
         elements = unstructured_modules.parse_file_or_url(content_input_path)
         chunks = unstructured_modules.chunk_elements(
             chunk_type="chunk_by_title", elements=elements, **kwargs)

diff --git a/camel/utils/__init__.py b/camel/utils/__init__.py
@@ -19,6 +19,7 @@
     download_tasks,
     get_task_list,
     check_server_running,
+    role_playing_with_function,
     get_system_information,
     to_pascal,
     PYDANTIC_V2,
@@ -45,4 +46,5 @@
     'BaseTokenCounter',
     'OpenAITokenCounter',
     'OpenSourceTokenCounter',
+    'role_playing_with_function',
 ]