Community: add modified_since argument to O365BaseLoader (#28708)
## What are we doing in this PR
We're adding an optional `modified_since` argument to `O365BaseLoader`.
When set, the O365 loader only loads documents newer than the
`modified_since` datetime.
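For illustration, a minimal sketch of how the new argument might be passed. Only the timezone-aware cutoff is from this PR; the `SharePointLoader` invocation and its parameters are assumptions shown in comments, with auth arguments elided:

```python
from datetime import datetime, timezone

# The new field requires a timezone-aware datetime (see its docstring below).
cutoff = datetime(2024, 12, 1, tzinfo=timezone.utc)
assert cutoff.tzinfo is not None  # a naive datetime cannot be compared safely

# Hypothetical invocation via a concrete O365 subclass (auth arguments elided):
# loader = SharePointLoader(document_library_id="...", modified_since=cutoff)
# docs = loader.load()  # only files modified after `cutoff` are downloaded
```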

## Why?
OneDrive and SharePoint instances can contain a large number of documents.
The current approach is to download and parse all files and let the indexer
deal with duplicates. This can be prohibitively time-consuming, especially
when using an OCR-based parser like
[zerox](https://github.com/langchain-ai/langchain/blob/fa0618883493cf6a1447a73b66cd10c0f028e09b/libs/community/langchain_community/document_loaders/pdf.py#L948).
This argument allows skipping documents that are older than the known time
of the last indexing run.

_Q: What if a file was modified during the last indexing run?
A: Users can set `modified_since` conservatively and the indexer will
still take care of duplicates._
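The skip check reduces to a timezone-aware datetime comparison. A standalone sketch of the same condition (the `should_load` helper is ours for illustration, not part of the loader):

```python
from datetime import datetime, timezone
from typing import Optional

def should_load(file_modified: datetime, modified_since: Optional[datetime]) -> bool:
    # Mirrors the loader's condition: load everything when no cutoff is set,
    # otherwise only files strictly newer than the cutoff.
    return (not modified_since) or (file_modified > modified_since)

cutoff = datetime(2024, 12, 1, tzinfo=timezone.utc)
older = datetime(2024, 11, 15, tzinfo=timezone.utc)
newer = datetime(2024, 12, 10, tzinfo=timezone.utc)

print(should_load(older, None))    # True: no cutoff, everything is loaded
print(should_load(older, cutoff))  # False: older than cutoff, skipped
print(should_load(newer, cutoff))  # True: modified after cutoff, loaded
```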




If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.

---------

Co-authored-by: Erick Friis <[email protected]>
MacanPN and efriis authored Dec 13, 2024
1 parent c855d43 commit 05ebe1e
Showing 1 changed file with 26 additions and 19 deletions: libs/community/langchain_community/document_loaders/base_o365.py
```diff
@@ -9,6 +9,7 @@
 import tempfile
 import urllib
 from abc import abstractmethod
+from datetime import datetime
 from pathlib import Path, PurePath
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union

@@ -86,6 +87,9 @@ class O365BaseLoader(BaseLoader, BaseModel):
     """Number of bytes to retrieve from each api call to the server. int or 'auto'."""
     recursive: bool = False
     """Should the loader recursively load subfolders?"""
+    modified_since: Optional[datetime] = None
+    """Only fetch documents modified since given datetime. The datetime object
+    must be timezone aware."""
     handlers: Optional[Dict[str, Any]] = {}
     """
     Provide custom handlers for MimeTypeBasedParser.

@@ -188,26 +192,29 @@ def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
             for file in items:
                 if file.is_file:
                     if file.mime_type in list(file_mime_types.values()):
-                        source = file.web_url
-                        if re.search(
-                            r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
+                        if (not self.modified_since) or (
+                            file.modified > self.modified_since
                         ):
-                            source = (
-                                file._parent.web_url
-                                + "/"
-                                + urllib.parse.quote(file.name)
-                            )
-                        file.download(to_path=temp_dir, chunk_size=self.chunk_size)
-                        metadata_dict[file.name] = {
-                            "source": source,
-                            "mime_type": file.mime_type,
-                            "created": str(file.created),
-                            "modified": str(file.modified),
-                            "created_by": str(file.created_by),
-                            "modified_by": str(file.modified_by),
-                            "description": file.description,
-                            "id": str(file.object_id),
-                        }
+                            source = file.web_url
+                            if re.search(
+                                r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
+                            ):
+                                source = (
+                                    file._parent.web_url
+                                    + "/"
+                                    + urllib.parse.quote(file.name)
+                                )
+                            file.download(to_path=temp_dir, chunk_size=self.chunk_size)
+                            metadata_dict[file.name] = {
+                                "source": source,
+                                "mime_type": file.mime_type,
+                                "created": str(file.created),
+                                "modified": str(file.modified),
+                                "created_by": str(file.created_by),
+                                "modified_by": str(file.modified_by),
+                                "description": file.description,
+                                "id": str(file.object_id),
+                            }

         loader = FileSystemBlobLoader(path=temp_dir)
         for blob in loader.yield_blobs():
```
