community: [bugfix] fix source path for office files in O365 (#28260)

# What problem are we fixing? Currently, documents loaded using `O365BaseLoader` take their source from `file.web_url` (where `file` is `<class 'O365.drive.File'>`). This works well for `.pdf` documents. Unfortunately, Office documents (`.xlsx`, `.docx`, ...) report their `web_url` in the following format: `https://sharepoint_address/sites/path/to/library/root/Doc.aspx?sourcedoc=%XXXXXXXX-1111-1111-XXXX-XXXXXXXXXX%7D&file=filename.xlsx&action=default&mobileredirect=true`. This obfuscates the path to the file. This PR uses the parent folder's path and the file name to reconstruct the actual location of the file. Knowing the file's location can be crucial for some RAG applications (the path to the file can carry information we don't want to lose). @vbarda Could you please look at this one? I'm @-mentioning you since we've already closed some PRs together :-) Co-authored-by: Erick Friis <[email protected]>
langchain-ai · Dec 9, 2024 · 75bc6bb · 75bc6bb
1 parent 534b8f4
commit 75bc6bb
Showing 1 changed file with 22 additions and 2 deletions.
diff --git a/libs/community/langchain_community/document_loaders/base_o365.py b/libs/community/langchain_community/document_loaders/base_o365.py
@@ -5,7 +5,9 @@
 import logging
 import mimetypes
 import os
+import re
 import tempfile
+import urllib
 from abc import abstractmethod
 from pathlib import Path, PurePath
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
@@ -186,9 +188,18 @@ def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
             for file in items:
                 if file.is_file:
                     if file.mime_type in list(file_mime_types.values()):
+                        source = file.web_url
+                        if re.search(
+                            r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
+                        ):
+                            source = (
+                                file._parent.web_url
+                                + "/"
+                                + urllib.parse.quote(file.name)
+                            )
                         file.download(to_path=temp_dir, chunk_size=self.chunk_size)
                         metadata_dict[file.name] = {
-                            "source": file.web_url,
+                            "source": source,
                             "mime_type": file.mime_type,
                             "created": str(file.created),
                             "modified": str(file.modified),
@@ -241,9 +252,18 @@ def _load_from_object_ids(
                     continue
                 if file.is_file:
                     if file.mime_type in list(file_mime_types.values()):
+                        source = file.web_url
+                        if re.search(
+                            r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
+                        ):
+                            source = (
+                                file._parent.web_url
+                                + "/"
+                                + urllib.parse.quote(file.name)
+                            )
                         file.download(to_path=temp_dir, chunk_size=self.chunk_size)
                         metadata_dict[file.name] = {
-                            "source": file.web_url,
+                            "source": source,
                             "mime_type": file.mime_type,
                             "created": file.created,
                             "modified": file.modified,