Skip to content

Commit 3fb9dd0

Browse files
github-actions[bot]maxesseartem-shelkovnikovelasticmachine
authored
[9.2] fix: Exclude /contentstorage/ URLs from Sharepoint Online Connector (#3630) (#3791)
Backports the following commits to 9.2: - fix: Exclude /contentstorage/ URLs from Sharepoint Online Connector (#3630) --------- Co-authored-by: Max Sanna <[email protected]> Co-authored-by: Artem Shelkovnikov <[email protected]> Co-authored-by: Elastic Machine <[email protected]>
1 parent 55869be commit 3fb9dd0

File tree

2 files changed

+54
-10
lines changed

2 files changed

+54
-10
lines changed

NOTICE.txt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1880,7 +1880,7 @@ SOFTWARE.
18801880

18811881

18821882
azure-core
1883-
1.35.1
1883+
1.36.0
18841884
MIT License
18851885
Copyright (c) Microsoft Corporation.
18861886

@@ -2606,7 +2606,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
26062606

26072607

26082608
cachetools
2609-
6.2.0
2609+
6.2.1
26102610
MIT License
26112611
The MIT License (MIT)
26122612

@@ -2687,7 +2687,7 @@ documentation is licensed as follows:
26872687

26882688

26892689
charset-normalizer
2690-
3.4.3
2690+
3.4.4
26912691
MIT
26922692
MIT License
26932693

@@ -5100,11 +5100,11 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
51005100

51015101

51025102
idna
5103-
3.10
5104-
BSD License
5103+
3.11
5104+
UNKNOWN
51055105
BSD 3-Clause License
51065106

5107-
Copyright (c) 2013-2024, Kim Davies and contributors.
5107+
Copyright (c) 2013-2025, Kim Davies and contributors.
51085108
All rights reserved.
51095109

51105110
Redistribution and use in source and binary forms, with or without
@@ -5977,7 +5977,7 @@ BSD
59775977
UNKNOWN
59785978

59795979
propcache
5980-
0.4.0
5980+
0.4.1
59815981
Apache Software License
59825982

59835983
Apache License

connectors/sources/sharepoint_online.py

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,19 @@
7575
WILDCARD = "*"
7676
DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference"
7777

78+
# Exclude specific SharePoint paths entirely at the connector level (pre sync-rules)
79+
EXCLUDED_SHAREPOINT_PATH_SEGMENTS = ["/contentstorage/"]
80+
81+
82+
def _is_excluded_sharepoint_url(url: str) -> bool:
83+
try:
84+
return any(
85+
segment in url.lower() for segment in EXCLUDED_SHAREPOINT_PATH_SEGMENTS
86+
)
87+
except Exception:
88+
return False
89+
90+
7891
CURSOR_SITE_DRIVE_KEY = "site_drives"
7992

8093
# Microsoft Graph API Delta constants
@@ -784,6 +797,11 @@ async def sites(
784797
if allowed_root_sites == [WILDCARD] or enumerate_all_sites:
785798
self._logger.debug(f"Looking up all sites to fetch: {allowed_root_sites}")
786799
async for site in self._all_sites(sharepoint_host, allowed_root_sites):
800+
if _is_excluded_sharepoint_url(site.get("webUrl", "")):
801+
self._logger.debug(
802+
f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
803+
)
804+
continue
787805
yield site
788806
else:
789807
self._logger.debug(f"Looking up individual sites: {allowed_root_sites}")
@@ -793,9 +811,20 @@ async def sites(
793811
async for site in self._fetch_site_and_subsites_by_path(
794812
sharepoint_host, allowed_site
795813
):
814+
if _is_excluded_sharepoint_url(site.get("webUrl", "")):
815+
self._logger.debug(
816+
f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
817+
)
818+
continue
796819
yield site
797820
else:
798-
yield await self._fetch_site(sharepoint_host, allowed_site)
821+
site_obj = await self._fetch_site(sharepoint_host, allowed_site)
822+
if _is_excluded_sharepoint_url(site_obj.get("webUrl", "")):
823+
self._logger.debug(
824+
f"Skipping excluded SharePoint site: {site_obj.get('webUrl', site_obj.get('id', 'unknown'))}"
825+
)
826+
continue
827+
yield site_obj
799828

800829
except NotFound:
801830
self._logger.warning(
@@ -852,8 +881,17 @@ async def _scroll_subsites_by_parent_id(self, parent_site_id):
852881
async def _recurse_sites(self, site_with_subsites):
853882
subsites = site_with_subsites.pop("sites", [])
854883
site_with_subsites.pop("sites@odata.context", None) # remove unnecessary field
855-
yield site_with_subsites
856-
if subsites:
884+
885+
is_excluded = _is_excluded_sharepoint_url(site_with_subsites.get("webUrl", ""))
886+
887+
if is_excluded:
888+
self._logger.debug(
889+
f"Skipping excluded SharePoint site: {site_with_subsites.get('webUrl', site_with_subsites.get('id', 'unknown'))}"
890+
)
891+
else:
892+
yield site_with_subsites
893+
894+
if subsites and not is_excluded:
857895
async for site in self._scroll_subsites_by_parent_id(
858896
site_with_subsites["id"]
859897
):
@@ -1113,6 +1151,12 @@ def _validate_sharepoint_rest_url(self, url):
11131151
if "OVERRIDE_URL" in os.environ:
11141152
return
11151153

1154+
# Exclude SharePoint Content Storage endpoints entirely
1155+
# These URLs are internal and should not be crawled by the connector
1156+
if _is_excluded_sharepoint_url(url):
1157+
# Silently return to let callers that explicitly skip excluded URLs proceed
1158+
return
1159+
11161160
# I haven't found a better way to validate tenant name for now.
11171161
actual_tenant_name = self._tenant_name_pattern.findall(url)[0]
11181162

0 commit comments

Comments
 (0)