Skip to content

Commit

Permalink
feat: retriever cookbook and utils change (#472)
Browse files Browse the repository at this point in the history
Co-authored-by: Appointat <[email protected]>
  • Loading branch information
Wendong-Fan and Appointat authored Mar 27, 2024
1 parent c276ab3 commit e080f25
Show file tree
Hide file tree
Showing 14 changed files with 791 additions and 176 deletions.
4 changes: 2 additions & 2 deletions camel/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from .math_functions import MATH_FUNCS
from .search_functions import SEARCH_FUNCS
from .weather_functions import WEATHER_FUNCS
from .unstructured_io_fuctions import UnstructuredModules
from ..loaders.unstructured_io import UnstructuredIO

__all__ = [
'OpenAIFunction',
Expand All @@ -29,5 +29,5 @@
'MATH_FUNCS',
'SEARCH_FUNCS',
'WEATHER_FUNCS',
'UnstructuredModules',
'UnstructuredIO',
]
22 changes: 22 additions & 0 deletions camel/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========

from .base_io import File, read_file
from .unstructured_io import UnstructuredIO

__all__ = [
'File',
'read_file',
'UnstructuredIO',
]
32 changes: 16 additions & 16 deletions camel/functions/base_io_functions.py → camel/loaders/base_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ def __init__(
Args:
name (str): The name of the file.
id (str): The unique identifier of the file.
metadata (Dict[str, Any], optional):
Additional metadata associated with the file. Defaults to None.
docs (List[Dict[str, Any]], optional):
A list of documents contained within the file. Defaults to None.
metadata (Dict[str, Any], optional): Additional metadata
associated with the file. Defaults to None.
docs (List[Dict[str, Any]], optional): A list of documents
contained within the file. Defaults to None.
"""
self.name = name
self.id = id
Expand All @@ -51,8 +51,8 @@ def from_bytes(cls, file: BytesIO) -> "File":
r"""Creates a File object from a BytesIO object.
Args:
file (BytesIO):
A BytesIO object representing the contents of the file.
file (BytesIO): A BytesIO object representing the contents of the
file.
Returns:
File: A File object.
Expand Down Expand Up @@ -96,8 +96,8 @@ def from_bytes(cls, file: BytesIO) -> "DocxFile":
r"""Creates a DocxFile object from a BytesIO object.
Args:
file (BytesIO):
A BytesIO object representing the contents of the docx file.
file (BytesIO): A BytesIO object representing the contents of the
docx file.
Returns:
DocxFile: A DocxFile object.
Expand Down Expand Up @@ -127,8 +127,8 @@ def from_bytes(cls, file: BytesIO) -> "PdfFile":
r"""Creates a PdfFile object from a BytesIO object.
Args:
file (BytesIO):
A BytesIO object representing the contents of the pdf file.
file (BytesIO): A BytesIO object representing the contents of the
pdf file.
Returns:
PdfFile: A PdfFile object.
Expand Down Expand Up @@ -162,8 +162,8 @@ def from_bytes(cls, file: BytesIO) -> "TxtFile":
r"""Creates a TxtFile object from a BytesIO object.
Args:
file (BytesIO):
A BytesIO object representing the contents of the txt file.
file (BytesIO): A BytesIO object representing the contents of the
txt file.
Returns:
TxtFile: A TxtFile object.
Expand All @@ -187,8 +187,8 @@ def from_bytes(cls, file: BytesIO) -> "JsonFile":
r"""Creates a JsonFile object from a BytesIO object.
Args:
file (BytesIO):
A BytesIO object representing the contents of the json file.
file (BytesIO): A BytesIO object representing the contents of the
json file.
Returns:
JsonFile: A JsonFile object.
Expand All @@ -211,8 +211,8 @@ def from_bytes(cls, file: BytesIO) -> "HtmlFile":
r"""Creates a HtmlFile object from a BytesIO object.
Args:
file (BytesIO):
A BytesIO object representing the contents of the html file.
file (BytesIO): A BytesIO object representing the contents of the
html file.
Returns:
HtmlFile: A HtmlFile object.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from typing import Any, Dict, List, Literal, Optional, Tuple, Union


class UnstructuredModules:
class UnstructuredIO:
r"""A class to handle various functionalities provided by the
Unstructured library, including version checking, parsing, cleaning,
extracting, staging, chunking data, and integrating with cloud
Expand All @@ -29,13 +29,13 @@ class UnstructuredModules:
UNSTRUCTURED_MIN_VERSION = "0.10.30" # Define the minimum version

def __init__(self):
r"""Initializes the UnstructuredModules class and ensures the
r"""Initializes the UnstructuredIO class and ensures the
installed version of Unstructured library meets the minimum
requirements.
"""
self.ensure_unstructured_version(self.UNSTRUCTURED_MIN_VERSION)
self._ensure_unstructured_version(self.UNSTRUCTURED_MIN_VERSION)

def ensure_unstructured_version(self, min_version: str) -> None:
def _ensure_unstructured_version(self, min_version: str) -> None:
r"""Validates that the installed 'Unstructured' library version
satisfies the specified minimum version requirement. This function is
essential for ensuring compatibility with features that depend on a
Expand Down Expand Up @@ -80,7 +80,7 @@ def parse_file_or_url(
Args:
input_path (str): Path to the file or URL to be parsed.
**kwargs Extra kwargs passed to the partition function
**kwargs: Extra kwargs passed to the partition function.
Returns:
List[Any]: The elements after parsing the file or URL, could be a
Expand Down Expand Up @@ -147,10 +147,10 @@ def clean_text_data(
clean_options: Optional[List[Tuple[str, Dict[str, Any]]]] = None,
) -> str:
r"""Cleans text data using a variety of cleaning functions provided by
the `'unstructured'` library.
the `unstructured` library.
This function applies multiple text cleaning utilities by calling the
`'unstructured'` library's cleaning bricks for operations like
`unstructured` library's cleaning bricks for operations like
replacing unicode quotes, removing extra whitespace, dashes, non-ascii
characters, and more.
Expand All @@ -161,36 +161,27 @@ def clean_text_data(
Args:
text (str): The text to be cleaned.
clean_options (dict): A dictionary specifying which
cleaning options to apply. The keys should
match the names of the cleaning functions,
and the values should be dictionaries
containing the parameters for each
function. Supported types:
'clean_extra_whitespace',
'clean_bullets',
'clean_ordered_bullets',
'clean_postfix',
'clean_prefix',
'clean_dashes',
'clean_trailing_punctuation',
'clean_non_ascii_chars',
'group_broken_paragraphs',
'remove_punctuation',
'replace_unicode_quotes',
'bytes_string_to_string',
'translate_text'.
clean_options (List[Tuple[str, Dict[str, Any]]], optional): A list
of `(function_name, params)` pairs specifying which cleaning
options to apply. Each name should match a cleaning function,
and each `params` dictionary contains the keyword arguments for
that function. Defaults to None. Supported types:
'clean_extra_whitespace', 'clean_bullets',
'clean_ordered_bullets', 'clean_postfix', 'clean_prefix',
'clean_dashes', 'clean_trailing_punctuation',
'clean_non_ascii_chars', 'group_broken_paragraphs',
'remove_punctuation', 'replace_unicode_quotes',
'bytes_string_to_string', 'translate_text'.
Returns:
str: The cleaned text.
Raises:
ValueError: If a cleaning option does not correspond to a
valid cleaning function in 'unstructured'.
valid cleaning function in `unstructured`.
Notes:
The `clean_options` names must correspond to valid cleaning
brick names from the 'unstructured' library.
brick names from the `unstructured` library.
Each brick's parameters must be provided in a nested dictionary
as the value for the key.
Expand Down Expand Up @@ -246,7 +237,7 @@ def clean_text_data(
**params)
else:
raise ValueError(
f"'{func_name}' is not a valid function in 'unstructured'."
f"'{func_name}' is not a valid function in `unstructured`."
)

return cleaned_text
Expand All @@ -267,15 +258,11 @@ def extract_data_from_text(
Args:
text (str): Text to extract data from.
extract_type (Literal['extract_datetimetz',
'extract_email_address',
'extract_ip_address',
'extract_ip_address_name',
'extract_mapi_id',
'extract_ordered_bullets',
'extract_text_after',
'extract_text_before',
'extract_us_phone_number']):
Type of data to extract.
'extract_email_address', 'extract_ip_address',
'extract_ip_address_name', 'extract_mapi_id',
'extract_ordered_bullets', 'extract_text_after',
'extract_text_before', 'extract_us_phone_number']): Type of
data to extract.
**kwargs: Additional keyword arguments for specific
extraction functions.
Expand Down Expand Up @@ -337,18 +324,12 @@ def stage_elements(
Args:
elements (List[Any]): List of Element objects to be staged.
stage_type (Literal['convert_to_csv',
'convert_to_dataframe',
'convert_to_dict',
'dict_to_elements',
'stage_csv_for_prodigy',
'stage_for_prodigy',
'stage_for_baseplate',
'stage_for_datasaur',
'stage_for_label_box',
'stage_for_label_studio',
'stage_for_weaviate']):
Type of staging to perform.
stage_type (Literal['convert_to_csv', 'convert_to_dataframe',
'convert_to_dict', 'dict_to_elements',
'stage_csv_for_prodigy', 'stage_for_prodigy',
'stage_for_baseplate', 'stage_for_datasaur',
'stage_for_label_box', 'stage_for_label_studio',
'stage_for_weaviate']): Type of staging to perform.
**kwargs: Additional keyword arguments specific to
the staging type.
Expand Down Expand Up @@ -413,7 +394,7 @@ def chunk_elements(self, elements: List[Any], chunk_type: str,
Args:
elements (List[Any]): List of Element objects to be chunked.
chunk_type (str): Type of chunking to apply. Supported types:
'chunk_by_title'.
'chunk_by_title'.
**kwargs: Additional keyword arguments for chunking.
Returns:
Expand Down
4 changes: 2 additions & 2 deletions camel/retrievers/vector_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from typing import Any, Dict, List, Optional

from camel.embeddings import BaseEmbedding, OpenAIEmbedding
from camel.functions import UnstructuredModules
from camel.functions import UnstructuredIO
from camel.retrievers import BaseRetriever
from camel.storages import BaseVectorStorage, VectorDBQuery, VectorRecord

Expand Down Expand Up @@ -58,7 +58,7 @@ def process( # type: ignore
embeddings.
**kwargs (Any): Additional keyword arguments for elements chunking.
"""
unstructured_modules = UnstructuredModules()
unstructured_modules = UnstructuredIO()
elements = unstructured_modules.parse_file_or_url(content_input_path)
chunks = unstructured_modules.chunk_elements(
chunk_type="chunk_by_title", elements=elements, **kwargs)
Expand Down
2 changes: 2 additions & 0 deletions camel/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
download_tasks,
get_task_list,
check_server_running,
role_playing_with_function,
get_system_information,
to_pascal,
PYDANTIC_V2,
Expand All @@ -45,4 +46,5 @@
'BaseTokenCounter',
'OpenAITokenCounter',
'OpenSourceTokenCounter',
'role_playing_with_function',
]
Loading

0 comments on commit e080f25

Please sign in to comment.