Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -283,4 +283,8 @@ def _get_default_document(
)

def _is_adls_directory(self, blob: BlobProperties) -> bool:
    """Return whether ``blob`` is an ADLS directory entry rather than a file.

    A blob counts as a directory only when it is zero bytes long, carries
    metadata, and that metadata maps ``hdi_isfolder`` to the string
    ``"true"``.

    Args:
        blob: Properties of the listed blob; only ``size`` and ``metadata``
            are read.

    Returns:
        ``True`` for a zero-byte blob flagged with ``hdi_isfolder == "true"``;
        ``False`` otherwise, including when ``blob.metadata`` is ``None``.
    """
    metadata = blob.metadata
    # Guard before calling ``.get``: metadata can be ``None``, and calling
    # ``.get`` on it would raise AttributeError instead of returning False.
    if blob.size != 0 or metadata is None:
        return False
    return metadata.get("hdi_isfolder") == "true"
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from tests.utils import (
CustomCSVLoader,
get_datalake_test_blobs,
get_expected_datalake_blobs,
get_expected_documents,
get_first_column_csv_loader,
get_test_blobs,
Expand Down Expand Up @@ -65,8 +64,8 @@ def upload_blobs_to_container(
([], None),
(None, None),
(None, "text"),
("text_file.txt", None),
(["text_file.txt", "json_file.json", "csv_file.csv"], None),
("directory/test_file.txt", None),
(["directory/test_file.txt", "json_file.json", "csv_file.csv"], None),
],
)
def test_lazy_load(
Expand Down Expand Up @@ -111,8 +110,8 @@ def test_lazy_load_with_loader_factory_configurations(
([], None),
(None, None),
(None, "text"),
("text_file.txt", None),
(["text_file.txt", "json_file.json", "csv_file.csv"], None),
("directory/test_file.txt", None),
(["directory/test_file.txt", "json_file.json", "csv_file.csv"], None),
],
)
async def test_alazy_load(
Expand Down Expand Up @@ -185,7 +184,7 @@ def datalake_container_setup(
"document-loader-tests"
)
container_client.create_container()
for blob in get_datalake_test_blobs():
for blob in get_datalake_test_blobs(include_directories=True):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the datalake tests, should we be including the directories in what we upload? To my understanding, ADLS will automatically create the directory, and relying on that automatic creation is more representative of how customers will have ADLS directories in the first place.

blob_client = container_client.get_blob_client(blob["blob_name"])
blob_client.upload_blob(
blob["blob_content"], metadata=blob["metadata"], overwrite=True
Expand All @@ -205,7 +204,7 @@ def test_datalake_excludes_directories(
container_name=container_name,
)
expected_documents_list = get_expected_documents(
get_expected_datalake_blobs(), datalake_account_url, container_name
get_datalake_test_blobs(), datalake_account_url, container_name
)
assert list(loader.lazy_load()) == expected_documents_list

Expand All @@ -220,6 +219,6 @@ async def test_async_datalake_excludes_directories(
container_name=container_name,
)
expected_documents_list = get_expected_documents(
get_expected_datalake_blobs(), datalake_account_url, container_name
get_datalake_test_blobs(), datalake_account_url, container_name
)
assert [doc async for doc in loader.alazy_load()] == expected_documents_list
90 changes: 36 additions & 54 deletions libs/azure-storage/tests/unit_tests/test_document_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
from tests.utils import (
CustomCSVLoader,
get_datalake_test_blobs,
get_expected_datalake_blobs,
get_expected_documents,
get_first_column_csv_loader,
get_test_blobs,
get_test_mock_blobs,
)


Expand Down Expand Up @@ -64,13 +64,7 @@ def mock_container_client(
"langchain_azure_storage.document_loaders.ContainerClient"
) as mock_container_client_cls:
mock_client = MagicMock(spec=ContainerClient)
mock_blobs = []
for blob in get_test_blobs():
mock_blob = MagicMock()
mock_blob.name = blob["blob_name"]
mock_blobs.append(mock_blob)

mock_client.list_blobs.return_value = mock_blobs
mock_client.list_blobs.return_value = get_test_mock_blobs(get_test_blobs())
mock_client.get_blob_client.side_effect = get_mock_blob_client
mock_container_client_cls.return_value = mock_client
yield mock_container_client_cls, mock_client
Expand All @@ -87,7 +81,7 @@ def _get_blob_client(blob_name: str) -> MagicMock:
mock_blob_data = MagicMock(spec=StorageStreamDownloader)
content = next(
blob["blob_content"]
for blob in get_datalake_test_blobs()
for blob in get_datalake_test_blobs(include_directories=True)
if blob["blob_name"] == blob_name
)
mock_blob_data.readall.return_value = content.encode("utf-8")
Expand All @@ -105,15 +99,9 @@ def mock_datalake_container_client(
"langchain_azure_storage.document_loaders.ContainerClient"
) as mock_container_client_cls:
mock_client = MagicMock(spec=ContainerClient)
mock_blobs = []
for blob in get_datalake_test_blobs():
mock_blob = MagicMock()
mock_blob.name = blob["blob_name"]
mock_blob.size = blob["size"]
mock_blob.metadata = blob["metadata"]
mock_blobs.append(mock_blob)

mock_client.list_blobs.return_value = mock_blobs
mock_client.list_blobs.return_value = get_test_mock_blobs(
get_datalake_test_blobs(include_directories=True)
)
mock_client.get_blob_client.side_effect = get_mock_datalake_blob_client
mock_container_client_cls.return_value = mock_client
yield mock_container_client_cls, mock_client
Expand Down Expand Up @@ -150,9 +138,7 @@ def async_mock_container_client(

async def get_async_blobs(**kwargs: Any) -> AsyncIterator[MagicMock]:
prefix = kwargs.get("name_starts_with")
for blob in get_test_blobs(prefix=prefix):
mock_blob = MagicMock()
mock_blob.name = blob["blob_name"]
for mock_blob in get_test_mock_blobs(get_test_blobs(), prefix):
yield mock_blob

async_mock_client = AsyncMock(spec=AsyncContainerClient)
Expand All @@ -173,7 +159,7 @@ def _get_async_blob_client(blob_name: str) -> AsyncMock:
mock_blob_data = AsyncMock(spec=AsyncStorageStreamDownloader)
content = next(
blob["blob_content"]
for blob in get_datalake_test_blobs()
for blob in get_datalake_test_blobs(include_directories=True)
if blob["blob_name"] == blob_name
)
mock_blob_data.readall.return_value = content.encode("utf-8")
Expand All @@ -193,11 +179,9 @@ def async_mock_datalake_container_client(

async def get_async_blobs(**kwargs: Any) -> AsyncIterator[MagicMock]:
prefix = kwargs.get("name_starts_with")
for blob in get_datalake_test_blobs(prefix=prefix):
mock_blob = MagicMock()
mock_blob.name = blob["blob_name"]
mock_blob.size = int(blob["size"])
mock_blob.metadata = blob["metadata"]
for mock_blob in get_test_mock_blobs(
get_datalake_test_blobs(include_directories=True), prefix=prefix
):
yield mock_blob

async_mock_client = AsyncMock(spec=AsyncContainerClient)
Expand All @@ -224,8 +208,8 @@ def test_lazy_load(
@pytest.mark.parametrize(
"blob_names",
[
"text_file.txt",
["text_file.txt", "json_file.json"],
"directory/test_file.txt",
["directory/test_file.txt", "json_file.json"],
],
)
def test_lazy_load_with_blob_names(
Expand All @@ -249,16 +233,14 @@ def test_get_blob_client(
mock_container_client: Tuple[MagicMock, MagicMock],
) -> None:
_, mock_client = mock_container_client
mock_blob = MagicMock()
mock_blob.name = "text_file.txt"
mock_blob.size = 12
mock_client.list_blobs.return_value = [mock_blob]

loader = create_azure_blob_storage_loader(prefix="text")
mock_client.list_blobs.return_value = get_test_mock_blobs(
get_test_blobs(blob_names=["json_file.json"])
)
loader = create_azure_blob_storage_loader(prefix="json")
list(loader.lazy_load())
mock_client.get_blob_client.assert_called_once_with("text_file.txt")
mock_client.get_blob_client.assert_called_once_with("json_file.json")
mock_client.list_blobs.assert_called_once_with(
name_starts_with="text", include="metadata"
name_starts_with="json", include="metadata"
)


Expand All @@ -267,7 +249,7 @@ def test_default_credential(
create_azure_blob_storage_loader: Callable[..., AzureBlobStorageLoader],
) -> None:
mock_container_client_cls, _ = mock_container_client
loader = create_azure_blob_storage_loader(blob_names="text_file.txt")
loader = create_azure_blob_storage_loader(blob_names="directory/test_file.txt")
list(loader.lazy_load())
cred = mock_container_client_cls.call_args[1]["credential"]
assert isinstance(cred, azure.identity.DefaultAzureCredential)
Expand All @@ -282,7 +264,7 @@ def test_override_credential(
mock_container_client_cls, _ = mock_container_client
mock_credential = AzureSasCredential("test_sas_token")
loader = create_azure_blob_storage_loader(
blob_names="text_file.txt", credential=mock_credential
blob_names="directory/test_file.txt", credential=mock_credential
)
list(loader.lazy_load())
assert mock_container_client_cls.call_args[1]["credential"] is mock_credential
Expand All @@ -295,7 +277,7 @@ def test_async_credential_provided_to_sync(

mock_credential = DefaultAzureCredential()
loader = create_azure_blob_storage_loader(
blob_names="text_file.txt", credential=mock_credential
blob_names="directory/test_file.txt", credential=mock_credential
)
with pytest.raises(ValueError, match="Cannot use synchronous load"):
list(loader.lazy_load())
Expand All @@ -307,7 +289,7 @@ def test_invalid_credential_type(
mock_credential = "account-key"
with pytest.raises(TypeError, match="Invalid credential type provided."):
create_azure_blob_storage_loader(
blob_names="text_file.txt", credential=mock_credential
blob_names="directory/test_file.txt", credential=mock_credential
)


Expand All @@ -316,7 +298,7 @@ def test_both_blob_names_and_prefix_set(
) -> None:
with pytest.raises(ValueError, match="Cannot specify both blob_names and prefix."):
create_azure_blob_storage_loader(
blob_names=[blob["blob_name"] for blob in get_test_blobs()], prefix="text"
blob_names=[blob["blob_name"] for blob in get_test_blobs()], prefix="json"
)


Expand Down Expand Up @@ -355,8 +337,8 @@ async def test_alazy_load(
@pytest.mark.parametrize(
"blob_names",
[
"text_file.txt",
["text_file.txt", "json_file.json"],
"directory/test_file.txt",
["directory/test_file.txt", "json_file.json"],
],
)
async def test_alazy_load_with_blob_names(
Expand All @@ -380,11 +362,11 @@ async def test_get_async_blob_client(
async_mock_container_client: Tuple[AsyncMock, AsyncMock],
) -> None:
_, async_mock_client = async_mock_container_client
loader = create_azure_blob_storage_loader(prefix="text")
loader = create_azure_blob_storage_loader(prefix="json")
[doc async for doc in loader.alazy_load()]
async_mock_client.get_blob_client.assert_called_once_with("text_file.txt")
async_mock_client.get_blob_client.assert_called_once_with("json_file.json")
async_mock_client.list_blobs.assert_called_once_with(
name_starts_with="text", include="metadata"
name_starts_with="json", include="metadata"
)


Expand All @@ -397,7 +379,7 @@ async def test_async_token_credential(
async_mock_container_client_cls, _ = async_mock_container_client
mock_credential = AsyncMock(spec=AsyncTokenCredential)
loader = create_azure_blob_storage_loader(
blob_names="text_file.txt", credential=mock_credential
blob_names="json_file.json", credential=mock_credential
)
[doc async for doc in loader.alazy_load()]
assert async_mock_container_client_cls.call_args[1]["credential"] is mock_credential
Expand All @@ -408,7 +390,7 @@ async def test_default_async_credential(
create_azure_blob_storage_loader: Callable[..., AzureBlobStorageLoader],
) -> None:
async_mock_container_client_cls, _ = async_mock_container_client
loader = create_azure_blob_storage_loader(blob_names="text_file.txt")
loader = create_azure_blob_storage_loader(blob_names="json_file.json")
[doc async for doc in loader.alazy_load()]
cred = async_mock_container_client_cls.call_args[1]["credential"]
assert isinstance(cred, azure.identity.aio.DefaultAzureCredential)
Expand All @@ -420,7 +402,7 @@ async def test_sync_credential_provided_to_async(
from azure.identity import DefaultAzureCredential

loader = create_azure_blob_storage_loader(
blob_names="text_file.txt", credential=DefaultAzureCredential()
blob_names="json_file.json", credential=DefaultAzureCredential()
)
with pytest.raises(ValueError, match="Cannot use asynchronous load"):
[doc async for doc in loader.alazy_load()]
Expand Down Expand Up @@ -454,7 +436,7 @@ def test_user_agent(
) -> None:
mock_container_client_cls, _ = mock_container_client
user_agent = f"azpartner-langchain/{__version__}"
loader = create_azure_blob_storage_loader(blob_names="text_file.txt")
loader = create_azure_blob_storage_loader(blob_names="json_file.json")
list(loader.lazy_load())
client_kwargs = mock_container_client_cls.call_args[1]
assert client_kwargs["user_agent"] == user_agent
Expand All @@ -466,7 +448,7 @@ async def test_async_user_agent(
) -> None:
async_mock_container_client_cls, _ = async_mock_container_client
user_agent = f"azpartner-langchain/{__version__}"
loader = create_azure_blob_storage_loader(blob_names="text_file.txt")
loader = create_azure_blob_storage_loader(blob_names="json_file.json")
[doc async for doc in loader.alazy_load()]
client_kwargs = async_mock_container_client_cls.call_args[1]
assert client_kwargs["user_agent"] == user_agent
Expand All @@ -480,7 +462,7 @@ def test_datalake_excludes_directories(
) -> None:
loader = create_azure_blob_storage_loader()
expected_documents = get_expected_documents(
get_expected_datalake_blobs(), account_url, container_name
get_datalake_test_blobs(), account_url, container_name
)
assert list(loader.lazy_load()) == expected_documents

Expand All @@ -493,6 +475,6 @@ async def test_async_datalake_excludes_directories(
) -> None:
loader = create_azure_blob_storage_loader()
expected_documents = get_expected_documents(
get_expected_datalake_blobs(), account_url, container_name
get_datalake_test_blobs(), account_url, container_name
)
assert [doc async for doc in loader.alazy_load()] == expected_documents
Loading