Skip to content

Commit 47f6be2

Browse files
authored
Reimplement fs.info() to not depend on fs.ls() (#390)
* Simplify code in _details() * Remove duplicate call to self._details() in _ls_blobs() * Replace crashing condition with a probably useless assert * Flip condition and dedent a lot of code * Ensure that fs.info(container) returns consistent results. When the container's info cannot be obtained from cache, info() now calls get_container_properties() to return all the info available. Fixes #377. * Ensure consistency between fs.info() and fs._info() * Don't call _ls() from _info() _info() now only queries the properties of its argument, instead of potentially doing a full listing of the parent directory. Note: due to apparent inconsistencies in Azure, info["deleted"] may now be either False or None for the same path, depending on whether the info dict comes from .info() or .ls(..., detail=True). Also fixes .exists() mishandling paths like "non-existent-container/key". Fixes #380.
1 parent 741a042 commit 47f6be2

File tree

2 files changed

+219
-226
lines changed

2 files changed

+219
-226
lines changed

adlfs/spec.py

Lines changed: 146 additions & 165 deletions
Original file line numberDiff line numberDiff line change
@@ -497,21 +497,6 @@ def split_path(
497497
version_id if self.version_aware and version_id else None,
498498
)
499499

500-
def info(self, path, refresh=False, **kwargs):
501-
try:
502-
fetch_from_azure = (path and self._ls_from_cache(path) is None) or refresh
503-
except Exception:
504-
fetch_from_azure = True
505-
if fetch_from_azure:
506-
return sync(
507-
self.loop,
508-
self._info,
509-
path,
510-
refresh,
511-
version_id=kwargs.get("version_id"),
512-
)
513-
return super().info(path)
514-
515500
def modified(self, path: str) -> datetime:
516501
return self.info(path)["last_modified"]
517502

@@ -531,44 +516,60 @@ async def _info(self, path, refresh=False, **kwargs):
531516
dict with keys: name (full path in the FS), size (in bytes), type (file,
532517
directory, or something else) and other FS-specific keys.
533518
"""
534-
if refresh:
535-
invalidate_cache = True
536-
else:
537-
invalidate_cache = False
538519
container, path, path_version_id = self.split_path(path)
539520
fullpath = "/".join([container, path]) if path else container
540521
version_id = _coalesce_version_id(path_version_id, kwargs.get("version_id"))
541522
kwargs["version_id"] = version_id
542523

543-
out = await self._ls(
544-
fullpath, detail=True, invalidate_cache=invalidate_cache, **kwargs
545-
)
546-
fullpath = fullpath.rstrip("/")
547-
out1 = [
548-
o
549-
for o in out
550-
if o["name"].rstrip("/") == fullpath
551-
and (version_id is None or o["version_id"] == version_id)
552-
]
553-
if len(out1) == 1:
554-
if "size" not in out1[0]:
555-
out1[0]["size"] = None
556-
return out1[0]
557-
elif len(out1) > 1 or out:
558-
return {"name": fullpath, "size": None, "type": "directory"}
559-
else:
560-
# Check the directory listing as the path may have been deleted
561-
out = await self._ls(
562-
self._parent(path),
563-
detail=True,
564-
invalidate_cache=invalidate_cache,
565-
**kwargs,
566-
)
567-
out = [o for o in out if o["name"].rstrip("/") == path]
568-
if out:
569-
return out[0]
570-
else:
571-
raise FileNotFoundError
524+
if fullpath == "":
525+
return {"name": "", "size": None, "type": "directory"}
526+
elif path == "":
527+
if not refresh and _ROOT_PATH in self.dircache:
528+
out = [o for o in self.dircache[_ROOT_PATH] if o["name"] == container]
529+
if out:
530+
return out[0]
531+
try:
532+
async with self.service_client.get_container_client(
533+
container=container
534+
) as cc:
535+
properties = await cc.get_container_properties()
536+
except ResourceNotFoundError as exc:
537+
raise FileNotFoundError from exc
538+
info = (await self._details([properties]))[0]
539+
# Make result consistent with _ls_containers()
540+
if not info.get("metadata"):
541+
info["metadata"] = None
542+
return info
543+
544+
if not refresh:
545+
out = self._ls_from_cache(fullpath)
546+
if out is not None:
547+
if self.version_aware and version_id is not None:
548+
out = [
549+
o
550+
for o in out
551+
if o["name"] == fullpath and match_blob_version(o, version_id)
552+
]
553+
if out:
554+
return out[0]
555+
else:
556+
out = [o for o in out if o["name"] == fullpath]
557+
if out:
558+
return out[0]
559+
return {"name": fullpath, "size": None, "type": "directory"}
560+
561+
try:
562+
async with self.service_client.get_blob_client(container, path) as bc:
563+
props = await bc.get_blob_properties(version_id=version_id)
564+
return (await self._details([props]))[0]
565+
except ResourceNotFoundError:
566+
pass
567+
568+
if not version_id:
569+
if await self._dir_exists(container, path):
570+
return {"name": fullpath, "size": None, "type": "directory"}
571+
572+
raise FileNotFoundError
572573

573574
def glob(self, path, **kwargs):
574575
return sync(self.loop, self._glob, path)
@@ -679,87 +680,74 @@ async def _ls_blobs(
679680
)
680681

681682
if (
682-
target_path not in self.dircache
683-
or return_glob
684-
or versions
685-
or not any(
683+
target_path in self.dircache
684+
and not return_glob
685+
and not versions
686+
and all(
686687
match_blob_version(b, version_id) for b in self.dircache[target_path]
687688
)
688689
):
689-
if container not in ["", delimiter]:
690-
# This is the case where the container name is passed
691-
async with self.service_client.get_container_client(
692-
container=container
693-
) as cc:
694-
path = path.strip("/")
695-
include = ["metadata"]
696-
if version_id is not None or versions:
697-
assert self.version_aware
698-
include.append("versions")
699-
blobs = cc.walk_blobs(include=include, name_starts_with=path)
700-
701-
# Check the depth that needs to be screened
702-
depth = target_path.count("/")
703-
outblobs = []
704-
try:
705-
async for next_blob in blobs:
706-
if depth in [0, 1] and path == "":
707-
outblobs.append(next_blob)
708-
elif isinstance(next_blob, BlobProperties):
709-
if next_blob["name"].count("/") == depth:
710-
outblobs.append(next_blob)
711-
elif not next_blob["name"].endswith("/") and (
712-
next_blob["name"].count("/") == (depth - 1)
713-
):
714-
outblobs.append(next_blob)
715-
else:
716-
async for blob_ in next_blob:
717-
if isinstance(blob_, BlobProperties) or isinstance(
718-
blob_, BlobPrefix
690+
return self.dircache[target_path]
691+
692+
assert container not in ["", delimiter]
693+
async with self.service_client.get_container_client(container=container) as cc:
694+
path = path.strip("/")
695+
include = ["metadata"]
696+
if version_id is not None or versions:
697+
assert self.version_aware
698+
include.append("versions")
699+
blobs = cc.walk_blobs(include=include, name_starts_with=path)
700+
701+
# Check the depth that needs to be screened
702+
depth = target_path.count("/")
703+
outblobs = []
704+
try:
705+
async for next_blob in blobs:
706+
if depth in [0, 1] and path == "":
707+
outblobs.append(next_blob)
708+
elif isinstance(next_blob, BlobProperties):
709+
if next_blob["name"].count("/") == depth:
710+
outblobs.append(next_blob)
711+
elif not next_blob["name"].endswith("/") and (
712+
next_blob["name"].count("/") == (depth - 1)
713+
):
714+
outblobs.append(next_blob)
715+
else:
716+
async for blob_ in next_blob:
717+
if isinstance(blob_, BlobProperties) or isinstance(
718+
blob_, BlobPrefix
719+
):
720+
if blob_["name"].endswith("/"):
721+
if blob_["name"].rstrip("/").count("/") == depth:
722+
outblobs.append(blob_)
723+
elif blob_["name"].count("/") == depth and (
724+
hasattr(blob_, "size") and blob_["size"] == 0
719725
):
720-
if blob_["name"].endswith("/"):
721-
if (
722-
blob_["name"].rstrip("/").count("/")
723-
== depth
724-
):
725-
outblobs.append(blob_)
726-
elif blob_["name"].count("/") == depth and (
727-
hasattr(blob_, "size")
728-
and blob_["size"] == 0
729-
):
730-
outblobs.append(blob_)
731-
else:
732-
pass
733-
elif blob_["name"].count("/") == (depth):
734-
outblobs.append(blob_)
735-
else:
736-
pass
737-
except ResourceNotFoundError:
738-
raise FileNotFoundError
739-
finalblobs = await self._details(
740-
outblobs,
741-
target_path=target_path,
742-
return_glob=return_glob,
743-
version_id=version_id,
744-
versions=versions,
745-
)
746-
if return_glob:
747-
return finalblobs
748-
finalblobs = await self._details(
749-
outblobs,
750-
target_path=target_path,
751-
version_id=version_id,
752-
versions=versions,
753-
)
754-
if not finalblobs:
755-
if not await self._exists(target_path):
756-
raise FileNotFoundError
757-
return []
758-
if not self.version_aware or finalblobs[0].get("is_current_version"):
759-
self.dircache[target_path] = finalblobs
760-
return finalblobs
761-
762-
return self.dircache[target_path]
726+
outblobs.append(blob_)
727+
else:
728+
pass
729+
elif blob_["name"].count("/") == (depth):
730+
outblobs.append(blob_)
731+
else:
732+
pass
733+
except ResourceNotFoundError:
734+
raise FileNotFoundError
735+
finalblobs = await self._details(
736+
outblobs,
737+
target_path=target_path,
738+
return_glob=return_glob,
739+
version_id=version_id,
740+
versions=versions,
741+
)
742+
if return_glob:
743+
return finalblobs
744+
if not finalblobs:
745+
if not await self._exists(target_path):
746+
raise FileNotFoundError
747+
return []
748+
if not self.version_aware or finalblobs[0].get("is_current_version"):
749+
self.dircache[target_path] = finalblobs
750+
return finalblobs
763751

764752
async def _ls(
765753
self,
@@ -880,41 +868,29 @@ async def _details(
880868
fname = f"{content.container}{delimiter}{content.name}"
881869
fname = fname.rstrip(delimiter)
882870
if content.has_key("size"): # NOQA
883-
data.update({"name": fname})
884-
data.update({"size": content.size})
885-
data.update({"type": "file"})
871+
data["name"] = fname
872+
data["size"] = content.size
873+
data["type"] = "file"
886874
else:
887-
data.update({"name": fname})
888-
data.update({"size": None})
889-
data.update({"type": "directory"})
875+
data["name"] = fname
876+
data["size"] = None
877+
data["type"] = "directory"
890878
else:
891879
fname = f"{content.name}"
892-
data.update({"name": fname})
893-
data.update({"size": None})
894-
data.update({"type": "directory"})
895-
if "metadata" in data.keys():
896-
if data["metadata"] is not None:
897-
if (
898-
"is_directory" in data["metadata"].keys()
899-
and data["metadata"]["is_directory"] == "true"
900-
):
901-
data.update({"type": "directory"})
902-
data.update({"size": None})
903-
elif (
904-
"is_directory" in data["metadata"].keys()
905-
and data["metadata"]["is_directory"] == "false"
906-
):
907-
data.update({"type": "file"})
908-
elif (
909-
"hdi_isfolder" in data["metadata"].keys()
910-
and data["metadata"]["hdi_isfolder"] == "true"
911-
):
912-
data.update({"type": "directory"})
913-
data.update({"size": None})
914-
else:
915-
pass
880+
data["name"] = fname
881+
data["size"] = None
882+
data["type"] = "directory"
883+
if data.get("metadata"):
884+
if data["metadata"].get("is_directory") == "true":
885+
data["type"] = "directory"
886+
data["size"] = None
887+
elif data["metadata"].get("is_directory") == "false":
888+
data["type"] = "file"
889+
elif data["metadata"].get("hdi_isfolder") == "true":
890+
data["type"] = "directory"
891+
data["size"] = None
916892
if return_glob:
917-
data.update({"name": data["name"].rstrip("/")})
893+
data["name"] = data["name"].rstrip("/")
918894
output.append(data)
919895
if target_path:
920896
if (
@@ -1413,17 +1389,22 @@ async def _exists(self, path):
14131389
if version_id is not None:
14141390
return False
14151391
raise
1392+
return await self._dir_exists(container_name, path)
14161393

1394+
async def _dir_exists(self, container, path):
14171395
dir_path = path.rstrip("/") + "/"
1418-
async with self.service_client.get_container_client(
1419-
container=container_name
1420-
) as container_client:
1421-
async for blob in container_client.list_blobs(
1422-
results_per_page=1, name_starts_with=dir_path
1423-
):
1424-
return True
1425-
else:
1426-
return False
1396+
try:
1397+
async with self.service_client.get_container_client(
1398+
container=container
1399+
) as container_client:
1400+
async for blob in container_client.list_blobs(
1401+
results_per_page=1, name_starts_with=dir_path
1402+
):
1403+
return True
1404+
else:
1405+
return False
1406+
except ResourceNotFoundError:
1407+
return False
14271408

14281409
async def _pipe_file(self, path, value, overwrite=True, **kwargs):
14291410
"""Set the bytes of given file"""

0 commit comments

Comments (0)