From 50bdebcdc67a6b415071546fe11fe4c044bda9ee Mon Sep 17 00:00:00 2001
From: Martin Tříska
Date: Fri, 13 Dec 2024 11:46:30 +0100
Subject: [PATCH 1/2] added modified_since to 0365 loader

---
 .../document_loaders/base_o365.py             | 46 +++++++++++--------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/base_o365.py b/libs/community/langchain_community/document_loaders/base_o365.py
index 981a637cbb3b1..2e3ef112b1360 100644
--- a/libs/community/langchain_community/document_loaders/base_o365.py
+++ b/libs/community/langchain_community/document_loaders/base_o365.py
@@ -9,9 +9,11 @@
 import tempfile
 import urllib
 from abc import abstractmethod
+from datetime import datetime
 from pathlib import Path, PurePath
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
 
+from dateutil.parser import parse
 from pydantic import (
     BaseModel,
     Field,
@@ -86,6 +88,9 @@ class O365BaseLoader(BaseLoader, BaseModel):
     """Number of bytes to retrieve from each api call to the server. int or 'auto'."""
     recursive: bool = False
     """Should the loader recursively load subfolders?"""
+    modified_since: Optional[datetime] = None
+    """Only fetch documents modified since given datetime. The datetime object
+    must be timezone aware."""
     handlers: Optional[Dict[str, Any]] = {}
     """
     Provide custom handlers for MimeTypeBasedParser.
@@ -188,26 +193,29 @@ def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
         for file in items:
             if file.is_file:
                 if file.mime_type in list(file_mime_types.values()):
-                    source = file.web_url
-                    if re.search(
-                        r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
+                    if (not self.modified_since) or (
+                        file.modified > self.modified_since
                     ):
-                        source = (
-                            file._parent.web_url
-                            + "/"
-                            + urllib.parse.quote(file.name)
-                        )
-                    file.download(to_path=temp_dir, chunk_size=self.chunk_size)
-                    metadata_dict[file.name] = {
-                        "source": source,
-                        "mime_type": file.mime_type,
-                        "created": str(file.created),
-                        "modified": str(file.modified),
-                        "created_by": str(file.created_by),
-                        "modified_by": str(file.modified_by),
-                        "description": file.description,
-                        "id": str(file.object_id),
-                    }
+                        source = file.web_url
+                        if re.search(
+                            r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
+                        ):
+                            source = (
+                                file._parent.web_url
+                                + "/"
+                                + urllib.parse.quote(file.name)
+                            )
+                        file.download(to_path=temp_dir, chunk_size=self.chunk_size)
+                        metadata_dict[file.name] = {
+                            "source": source,
+                            "mime_type": file.mime_type,
+                            "created": str(file.created),
+                            "modified": str(file.modified),
+                            "created_by": str(file.created_by),
+                            "modified_by": str(file.modified_by),
+                            "description": file.description,
+                            "id": str(file.object_id),
+                        }
 
         loader = FileSystemBlobLoader(path=temp_dir)
         for blob in loader.yield_blobs():

From 482f73824cad17ee64a6d6fd662f877ace33940e Mon Sep 17 00:00:00 2001
From: Martin Tříska
Date: Fri, 13 Dec 2024 11:58:37 +0100
Subject: [PATCH 2/2] linting

---
 libs/community/langchain_community/document_loaders/base_o365.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libs/community/langchain_community/document_loaders/base_o365.py b/libs/community/langchain_community/document_loaders/base_o365.py
index 2e3ef112b1360..4cd341fadde19 100644
--- a/libs/community/langchain_community/document_loaders/base_o365.py
+++ b/libs/community/langchain_community/document_loaders/base_o365.py
@@ -13,7 +13,6 @@
 from pathlib import Path, PurePath
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
 
-from dateutil.parser import parse
 from pydantic import (
     BaseModel,
     Field,
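
Note (not part of the patches above): a minimal usage sketch of the new modified_since option. It assumes the SharePointLoader subclass inherits the field from O365BaseLoader and that document_library_id and the O365 credentials are placeholders configured elsewhere (e.g. via environment variables); the datetime must be timezone-aware because it is compared against the timezone-aware file.modified values returned by Microsoft 365.

    from datetime import datetime, timezone

    from langchain_community.document_loaders.sharepoint import SharePointLoader

    # Timezone-aware cutoff: a naive datetime would fail when compared against
    # the timezone-aware `file.modified` timestamps from Microsoft 365.
    cutoff = datetime(2024, 12, 1, tzinfo=timezone.utc)

    loader = SharePointLoader(
        document_library_id="<document-library-id>",  # placeholder value
        recursive=True,
        modified_since=cutoff,  # field added by PATCH 1/2
    )

    # Only files whose `modified` timestamp is strictly greater than `cutoff`
    # are downloaded and turned into documents; everything else is skipped.
    docs = loader.load()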