7575WILDCARD = "*"
7676DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference"
7777
78+ # Exclude specific SharePoint paths entirely at the connector level (pre sync-rules)
79+ EXCLUDED_SHAREPOINT_PATH_SEGMENTS = ["/contentstorage/" ]
80+
81+
82+ def _is_excluded_sharepoint_url (url : str ) -> bool :
83+ try :
84+ return any (
85+ segment in url .lower () for segment in EXCLUDED_SHAREPOINT_PATH_SEGMENTS
86+ )
87+ except Exception :
88+ return False
89+
90+
7891CURSOR_SITE_DRIVE_KEY = "site_drives"
7992
8093# Microsoft Graph API Delta constants
@@ -784,6 +797,11 @@ async def sites(
784797 if allowed_root_sites == [WILDCARD ] or enumerate_all_sites :
785798 self ._logger .debug (f"Looking up all sites to fetch: { allowed_root_sites } " )
786799 async for site in self ._all_sites (sharepoint_host , allowed_root_sites ):
800+ if _is_excluded_sharepoint_url (site .get ("webUrl" , "" )):
801+ self ._logger .debug (
802+ f"Skipping excluded SharePoint site: { site .get ('webUrl' , site .get ('id' , 'unknown' ))} "
803+ )
804+ continue
787805 yield site
788806 else :
789807 self ._logger .debug (f"Looking up individual sites: { allowed_root_sites } " )
@@ -793,9 +811,20 @@ async def sites(
793811 async for site in self ._fetch_site_and_subsites_by_path (
794812 sharepoint_host , allowed_site
795813 ):
814+ if _is_excluded_sharepoint_url (site .get ("webUrl" , "" )):
815+ self ._logger .debug (
816+ f"Skipping excluded SharePoint site: { site .get ('webUrl' , site .get ('id' , 'unknown' ))} "
817+ )
818+ continue
796819 yield site
797820 else :
798- yield await self ._fetch_site (sharepoint_host , allowed_site )
821+ site_obj = await self ._fetch_site (sharepoint_host , allowed_site )
822+ if _is_excluded_sharepoint_url (site_obj .get ("webUrl" , "" )):
823+ self ._logger .debug (
824+ f"Skipping excluded SharePoint site: { site_obj .get ('webUrl' , site_obj .get ('id' , 'unknown' ))} "
825+ )
826+ continue
827+ yield site_obj
799828
800829 except NotFound :
801830 self ._logger .warning (
@@ -852,8 +881,17 @@ async def _scroll_subsites_by_parent_id(self, parent_site_id):
852881 async def _recurse_sites (self , site_with_subsites ):
853882 subsites = site_with_subsites .pop ("sites" , [])
854883 site_with_subsites .
pop (
"[email protected] " ,
None )
# remove unnecessary field 855- yield site_with_subsites
856- if subsites :
884+
885+ is_excluded = _is_excluded_sharepoint_url (site_with_subsites .get ("webUrl" , "" ))
886+
887+ if is_excluded :
888+ self ._logger .debug (
889+ f"Skipping excluded SharePoint site: { site_with_subsites .get ('webUrl' , site_with_subsites .get ('id' , 'unknown' ))} "
890+ )
891+ else :
892+ yield site_with_subsites
893+
894+ if subsites and not is_excluded :
857895 async for site in self ._scroll_subsites_by_parent_id (
858896 site_with_subsites ["id" ]
859897 ):
@@ -1113,6 +1151,12 @@ def _validate_sharepoint_rest_url(self, url):
11131151 if "OVERRIDE_URL" in os .environ :
11141152 return
11151153
1154+ # Exclude SharePoint Content Storage endpoints entirely
1155+ # These URLs are internal and should not be crawled by the connector
1156+ if _is_excluded_sharepoint_url (url ):
1157+ # Silently return to let callers that explicitly skip excluded URLs proceed
1158+ return
1159+
11161160 # I haven't found a better way to validate tenant name for now.
11171161 actual_tenant_name = self ._tenant_name_pattern .findall (url )[0 ]
11181162
0 commit comments