From f4fc709b5aa28d24e45a0c938c7c166c61ef7cd5 Mon Sep 17 00:00:00 2001 From: ilongin Date: Fri, 27 Jun 2025 16:14:31 +0200 Subject: [PATCH 1/6] added env var and special method to get full dataset name --- docs/guide/env.md | 4 ++++ src/datachain/catalog/catalog.py | 24 ++++++++++++++++++++++++ src/datachain/cli/commands/datasets.py | 14 ++++---------- src/datachain/lib/dc/datachain.py | 18 ++++++------------ src/datachain/lib/dc/datasets.py | 17 ++++++++--------- tests/unit/lib/test_datachain.py | 20 ++++++++++++++++++++ 6 files changed, 66 insertions(+), 31 deletions(-) diff --git a/docs/guide/env.md b/docs/guide/env.md index 92294e83b..2f8b82864 100644 --- a/docs/guide/env.md +++ b/docs/guide/env.md @@ -15,4 +15,8 @@ List of environment variables used to configure DataChain behavior. - `DATACHAIN_STUDIO_TOKEN` – Authentication token for Studio. - `DATACHAIN_STUDIO_TEAM` – Studio team name. +### Namespaces and projects +- `DATACHAIN_NAMESPACE` – Namespace name to use as default. +- `DATACHAIN_PROJECT` – Project name to use as default. + Note: Some environment variables are used internally and may not be documented here. For the most up-to-date list, refer to the source code. 
diff --git a/src/datachain/catalog/catalog.py b/src/datachain/catalog/catalog.py index 86f8d33d6..a6812ad34 100644 --- a/src/datachain/catalog/catalog.py +++ b/src/datachain/catalog/catalog.py @@ -1059,6 +1059,30 @@ def create_dataset_from_sources( return self.get_dataset(name, project) + def get_full_dataset_name( + self, + name: str, + project_name: Optional[str] = None, + namespace_name: Optional[str] = None, + ) -> tuple[str, str, str]: + """""" + parsed_namespace_name, parsed_project_name, name = parse_dataset_name(name) + + namespace_name = ( + parsed_namespace_name + or namespace_name + or os.environ.get("DATACHAIN_NAMESPACE") + or self.metastore.default_namespace_name + ) + project_name = ( + parsed_project_name + or project_name + or os.environ.get("DATACHAIN_PROJECT") + or self.metastore.default_project_name + ) + + return namespace_name, project_name, name + def get_dataset( self, name: str, project: Optional[Project] = None ) -> DatasetRecord: diff --git a/src/datachain/cli/commands/datasets.py b/src/datachain/cli/commands/datasets.py index 223c24d9d..ada09f199 100644 --- a/src/datachain/cli/commands/datasets.py +++ b/src/datachain/cli/commands/datasets.py @@ -8,7 +8,6 @@ from datachain.cli.utils import determine_flavors from datachain.config import Config -from datachain.dataset import parse_dataset_name from datachain.error import DataChainError, DatasetNotFoundError from datachain.studio import list_datasets as list_datasets_studio @@ -106,9 +105,8 @@ def list_datasets_local(catalog: "Catalog", name: Optional[str] = None): def list_datasets_local_versions(catalog: "Catalog", name: str): - namespace_name, project_name, name = parse_dataset_name(name) - namespace_name = namespace_name or catalog.metastore.default_namespace_name - project_name = project_name or catalog.metastore.default_project_name + namespace_name, project_name, name = catalog.get_full_dataset_name(name) + project = catalog.metastore.get_project(project_name, namespace_name) ds = 
catalog.get_dataset(name, project) for v in ds.versions: @@ -137,9 +135,7 @@ def rm_dataset( studio: Optional[bool] = False, team: Optional[str] = None, ): - namespace_name, project_name, name = parse_dataset_name(name) - namespace_name = namespace_name or catalog.metastore.default_namespace_name - project_name = project_name or catalog.metastore.default_project_name + namespace_name, project_name, name = catalog.get_full_dataset_name(name) if not catalog.metastore.is_local_dataset(namespace_name) and studio: from datachain.studio import remove_studio_dataset @@ -166,9 +162,7 @@ def edit_dataset( attrs: Optional[list[str]] = None, team: Optional[str] = None, ): - namespace_name, project_name, name = parse_dataset_name(name) - namespace_name = namespace_name or catalog.metastore.default_namespace_name - project_name = project_name or catalog.metastore.default_project_name + namespace_name, project_name, name = catalog.get_full_dataset_name(name) if catalog.metastore.is_local_dataset(namespace_name): try: diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index e55242396..2eeebc61e 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -24,7 +24,7 @@ from tqdm import tqdm from datachain import semver -from datachain.dataset import DatasetRecord, parse_dataset_name +from datachain.dataset import DatasetRecord from datachain.delta import delta_disabled from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError from datachain.func import literal @@ -557,6 +557,7 @@ def save( # type: ignore[override] update_version: which part of the dataset version to automatically increase. Available values: `major`, `minor` or `patch`. Default is `patch`. 
""" + catalog = self.session.catalog if version is not None: semver.validate(version) @@ -570,17 +571,10 @@ def save( # type: ignore[override] " patch" ) - namespace_name, project_name, name = parse_dataset_name(name) - - namespace_name = ( - namespace_name - or self._settings.namespace - or self.session.catalog.metastore.default_namespace_name - ) - project_name = ( - project_name - or self._settings.project - or self.session.catalog.metastore.default_project_name + namespace_name, project_name, name = catalog.get_full_dataset_name( + name, + namespace_name=self._settings.namespace, + project_name=self._settings.project, ) try: diff --git a/src/datachain/lib/dc/datasets.py b/src/datachain/lib/dc/datasets.py index ad6cc0c17..bd9405755 100644 --- a/src/datachain/lib/dc/datasets.py +++ b/src/datachain/lib/dc/datasets.py @@ -1,7 +1,6 @@ from collections.abc import Sequence from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints -from datachain.dataset import parse_dataset_name from datachain.error import ( DatasetNotFoundError, DatasetVersionNotFoundError, @@ -125,11 +124,11 @@ def read_dataset( session = Session.get(session) catalog = session.catalog - namespace_name, project_name, name = parse_dataset_name(name) - namespace_name = ( - namespace_name or namespace or catalog.metastore.default_namespace_name + namespace_name, project_name, name = catalog.get_full_dataset_name( + name, + project_name=project, + namespace_name=namespace, ) - project_name = project_name or project or catalog.metastore.default_project_name if version is not None: try: @@ -320,11 +319,11 @@ def delete_dataset( session = Session.get(session, in_memory=in_memory) catalog = session.catalog - namespace_name, project_name, name = parse_dataset_name(name) - namespace_name = ( - namespace_name or namespace or catalog.metastore.default_namespace_name + namespace_name, project_name, name = catalog.get_full_dataset_name( + name, + project_name=project, + 
namespace_name=namespace, ) - project_name = project_name or project or catalog.metastore.default_project_name if not catalog.metastore.is_local_dataset(namespace_name) and studio: return remove_studio_dataset( diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index e38926da8..78bca9a8b 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -3445,6 +3445,26 @@ def test_save_specify_only_non_default_project( dc.read_dataset(name="fibonacci") +@pytest.mark.parametrize("use_settings", (True, False)) +def test_save_all_ways_to_set_project(test_session, monkeypatch): + catalog = test_session.catalog + catalog.metastore.create_project("n1", "p1") + catalog.metastore.create_project("n2", "p2") + p3 = catalog.metastore.create_project("n3", "p3") + + monkeypatch.setenv("DATACHAIN_NAMESPACE", "n1") + monkeypatch.setenv("DATACHAIN_PROJECT", "p1") + ds = ( + dc.read_values(fib=[1, 2, 3, 3, 5, 8], session=test_session) + .settings(namespace="n2", project="p2") + .save("n3.p3.numbers") + ) + + assert ds.project == p3 + + dc.read_dataset("n3.p3.numbers") + + @pytest.mark.parametrize("allow_create_project", [False]) @skip_if_not_sqlite def test_save_create_project_not_allowed(test_session, allow_create_project): From 0fe008afef1fd4917bfd208b325297276638d5b8 Mon Sep 17 00:00:00 2001 From: ilongin Date: Sat, 28 Jun 2025 00:35:19 +0200 Subject: [PATCH 2/6] finished test --- src/datachain/catalog/catalog.py | 10 ++++- src/datachain/lib/dc/records.py | 2 +- tests/unit/lib/test_datachain.py | 71 +++++++++++++++++++++++++------- 3 files changed, 66 insertions(+), 17 deletions(-) diff --git a/src/datachain/catalog/catalog.py b/src/datachain/catalog/catalog.py index a6812ad34..f5653a85b 100644 --- a/src/datachain/catalog/catalog.py +++ b/src/datachain/catalog/catalog.py @@ -1068,16 +1068,22 @@ def get_full_dataset_name( """""" parsed_namespace_name, parsed_project_name, name = parse_dataset_name(name) + namespace_env = 
os.environ.get("DATACHAIN_NAMESPACE") + project_env = os.environ.get("DATACHAIN_PROJECT") + if project_env and len(project_env.split(".")) == 2: + # we allow setting both namespace and project in DATACHAIN_PROJECT + namespace_env, project_env = project_env.split(".") + namespace_name = ( parsed_namespace_name or namespace_name - or os.environ.get("DATACHAIN_NAMESPACE") + or namespace_env or self.metastore.default_namespace_name ) project_name = ( parsed_project_name or project_name - or os.environ.get("DATACHAIN_PROJECT") + or project_env or self.metastore.default_project_name ) diff --git a/src/datachain/lib/dc/records.py b/src/datachain/lib/dc/records.py index 41c3610d9..a327f9045 100644 --- a/src/datachain/lib/dc/records.py +++ b/src/datachain/lib/dc/records.py @@ -97,4 +97,4 @@ def read_records( for chunk in batched(records, INSERT_BATCH_SIZE): warehouse.insert_rows(table, chunk) warehouse.insert_rows_done(table) - return read_dataset(name=dsr.name, session=session, settings=settings) + return read_dataset(name=dsr.full_name, session=session, settings=settings) diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index 78bca9a8b..4294f6d2d 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -3445,24 +3445,67 @@ def test_save_specify_only_non_default_project( dc.read_dataset(name="fibonacci") -@pytest.mark.parametrize("use_settings", (True, False)) -def test_save_all_ways_to_set_project(test_session, monkeypatch): - catalog = test_session.catalog - catalog.metastore.create_project("n1", "p1") - catalog.metastore.create_project("n2", "p2") - p3 = catalog.metastore.create_project("n3", "p3") +@pytest.mark.parametrize( + ( + "ds_name_namespace,ds_name_project," + "settings_namespace,settings_project," + "env_namespace,env_project," + "result_ds_namespace,result_ds_project" + ), + [ + ("n3", "p3", "n2", "p2", "n1", "p1", "n3", "p3"), + ("", "", "n2", "p2", "n1", "p1", "n2", "p2"), + ("", "", "", "", 
"n1", "p1", "n1", "p1"), + ("", "", "", "", "n5", "n1.p1", "n1", "p1"), + ("", "", "", "", "", "n1.p1", "n1", "p1"), + ("", "", "", "", "", "n1.p1", "n1", "p1"), + ("n3", "p3", "n2", "p2", "", "", "n3", "p3"), + ("n3", "p3", "", "", "", "", "n3", "p3"), + ("n3", "p3", "", "", "n1", "p1", "n3", "p3"), + ("", "", "", "", "", "", "", ""), + ], +) +def test_save_all_ways_to_set_project( + test_session, + monkeypatch, + ds_name_namespace, + ds_name_project, + settings_namespace, + settings_project, + env_namespace, + env_project, + result_ds_namespace, + result_ds_project, +): + def _full_name(namespace, project, name) -> str: + if namespace and project: + return f"{namespace}.{project}.{name}" + return name + + metastore = test_session.catalog.metastore + ds_name = "numbers" + metastore.create_project("n1", "p1") + metastore.create_project("n2", "p2") + metastore.create_project("n3", "p3") + + monkeypatch.setenv("DATACHAIN_NAMESPACE", env_namespace) + monkeypatch.setenv("DATACHAIN_PROJECT", env_project) + + if not result_ds_namespace and not result_ds_project: + # special case when nothing is defined - we set default ones + result_ds_namespace = metastore.default_namespace_name + result_ds_project = metastore.default_project_name - monkeypatch.setenv("DATACHAIN_NAMESPACE", "n1") - monkeypatch.setenv("DATACHAIN_PROJECT", "p1") ds = ( - dc.read_values(fib=[1, 2, 3, 3, 5, 8], session=test_session) - .settings(namespace="n2", project="p2") - .save("n3.p3.numbers") + dc.read_values(num=[1, 2, 3, 4], session=test_session) + .settings(namespace=settings_namespace, project=settings_project) + .save(_full_name(ds_name_namespace, ds_name_project, ds_name)) ) - assert ds.project == p3 - - dc.read_dataset("n3.p3.numbers") + assert ds.dataset.project == metastore.get_project( + result_ds_project, result_ds_namespace + ) + dc.read_dataset(_full_name(result_ds_namespace, result_ds_project, ds_name)) @pytest.mark.parametrize("allow_create_project", [False]) From 
a6bd0f2e529d6787f0c8f23a59012dcea78da95c Mon Sep 17 00:00:00 2001 From: ilongin Date: Sat, 28 Jun 2025 01:50:44 +0200 Subject: [PATCH 3/6] fixing docs --- docs/guide/env.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guide/env.md b/docs/guide/env.md index 2f8b82864..47715d93f 100644 --- a/docs/guide/env.md +++ b/docs/guide/env.md @@ -17,6 +17,6 @@ List of environment variables used to configure DataChain behavior. ### Namespaces and projects - `DATACHAIN_NAMESPACE` – Namespace name to use as default. -- `DATACHAIN_PROJECT` – Project name to use as default. +- `DATACHAIN_PROJECT` – Project name or combination of namespace name and project name separated by `.` to use as default, example: `DATACHAIN_PROJECT=dev.analytics` Note: Some environment variables are used internally and may not be documented here. For the most up-to-date list, refer to the source code. From caf097dd0ffd492fba2daffc82215a8a99762422 Mon Sep 17 00:00:00 2001 From: ilongin Date: Sat, 28 Jun 2025 02:16:08 +0200 Subject: [PATCH 4/6] adding docs about env variables to namespace.md --- docs/guide/namespaces.md | 46 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/docs/guide/namespaces.md b/docs/guide/namespaces.md index 92059b959..536120553 100644 --- a/docs/guide/namespaces.md +++ b/docs/guide/namespaces.md @@ -82,6 +82,49 @@ This is equivalent to saving to `dev.analytics.metrics`. In CLI, `.settings()` is only supported when both `namespace` and `project` are set to `"local"`. +## Setting Namespace and Project via Environment Variables + +In addition to using `.settings()`, you can configure the namespace and project using environment variables: + +- `DATACHAIN_NAMESPACE` sets the namespace. +- `DATACHAIN_PROJECT` sets the project name, or both the namespace and project using the format `namespace.project`. 
+ +### Examples + +``` +# Set namespace only +export DATACHAIN_NAMESPACE=dev + +# Set project only +export DATACHAIN_PROJECT=analytics + +# Set both namespace and project +export DATACHAIN_PROJECT=dev.analytics +``` + +## How Namespace and Project Are Resolved + +When determining which namespace and project to use, Datachain applies the following precedence: + +1. **Fully qualified dataset name** + If the dataset name includes both the namespace and project, these values take highest precedence. + ```python + dc.read_dataset("dev.analytics.metrics") + +2. **Explicit settings in code** + Values provided via `.settings()` or passed directly to `read_dataset()` or similar methods. + ```python + dc.settings(namespace="dev", project="analytics") + dc.read_dataset("metrics", namespace="dev", project="analytics") + ``` +3. **Environment variables** + Namespace and project set using environment variables: + ```console + export DATACHAIN_PROJECT=dev.analytics + ``` +4. **Defaults** +If none of the above are provided, Datachain falls back to the default namespace and project. 
+ ## Reading a Dataset from a Project To read a dataset from a specific namespace and project: @@ -115,5 +158,4 @@ import datachain as dc dc.read_values(scores=[0.8, 1.5, 2.1]).save("metrics") ds = dc.read_dataset("local.local.metrics") -ds.show() -``` +ds.sho From 6bc8b9b771360edb0ee69f06be0a6e8495e88f03 Mon Sep 17 00:00:00 2001 From: ilongin Date: Sat, 28 Jun 2025 02:54:01 +0200 Subject: [PATCH 5/6] fixing test and docs --- src/datachain/catalog/catalog.py | 5 ++++- tests/unit/lib/test_datachain.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/datachain/catalog/catalog.py b/src/datachain/catalog/catalog.py index f5653a85b..d0eba0962 100644 --- a/src/datachain/catalog/catalog.py +++ b/src/datachain/catalog/catalog.py @@ -1065,7 +1065,10 @@ def get_full_dataset_name( project_name: Optional[str] = None, namespace_name: Optional[str] = None, ) -> tuple[str, str, str]: - """""" + """ + Returns dataset name together with separated namespace and project name. + It takes into account all the ways namespace and project can be added. 
+ """ parsed_namespace_name, parsed_project_name, name = parse_dataset_name(name) namespace_env = os.environ.get("DATACHAIN_NAMESPACE") diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index 4294f6d2d..60366f93c 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -3458,7 +3458,6 @@ def test_save_specify_only_non_default_project( ("", "", "", "", "n1", "p1", "n1", "p1"), ("", "", "", "", "n5", "n1.p1", "n1", "p1"), ("", "", "", "", "", "n1.p1", "n1", "p1"), - ("", "", "", "", "", "n1.p1", "n1", "p1"), ("n3", "p3", "n2", "p2", "", "", "n3", "p3"), ("n3", "p3", "", "", "", "", "n3", "p3"), ("n3", "p3", "", "", "n1", "p1", "n3", "p3"), From ff182b86714fee3bc68e07da45fa42221d7c445c Mon Sep 17 00:00:00 2001 From: ilongin Date: Sun, 29 Jun 2025 00:04:27 +0200 Subject: [PATCH 6/6] fixing validation of project / namespace name, adding tests, fixing docs --- docs/guide/namespaces.md | 2 +- src/datachain/data_storage/metastore.py | 15 ++++++- src/datachain/data_storage/sqlite.py | 8 +++- src/datachain/dataset.py | 4 +- tests/unit/lib/test_datachain.py | 56 +++++++++++++++++++++++-- tests/unit/test_dataset.py | 2 +- 6 files changed, 76 insertions(+), 11 deletions(-) diff --git a/docs/guide/namespaces.md b/docs/guide/namespaces.md index 536120553..6697d3775 100644 --- a/docs/guide/namespaces.md +++ b/docs/guide/namespaces.md @@ -158,4 +158,4 @@ import datachain as dc dc.read_values(scores=[0.8, 1.5, 2.1]).save("metrics") ds = dc.read_dataset("local.local.metrics") -ds.sho +ds.show() diff --git a/src/datachain/data_storage/metastore.py b/src/datachain/data_storage/metastore.py index 3af3e908f..b93bc1582 100644 --- a/src/datachain/data_storage/metastore.py +++ b/src/datachain/data_storage/metastore.py @@ -132,6 +132,7 @@ def create_namespace( description: Optional[str] = None, uuid: Optional[str] = None, ignore_if_exists: bool = True, + validate: bool = True, **kwargs, ) -> Namespace: """Creates new namespace""" 
@@ -192,6 +193,7 @@ def create_project( description: Optional[str] = None, uuid: Optional[str] = None, ignore_if_exists: bool = True, + validate: bool = True, **kwargs, ) -> Project: """Creates new project in specific namespace""" @@ -725,8 +727,11 @@ def create_namespace( description: Optional[str] = None, uuid: Optional[str] = None, ignore_if_exists: bool = True, + validate: bool = True, **kwargs, ) -> Namespace: + if validate: + Namespace.validate_name(name) query = self._namespaces_insert().values( name=name, uuid=uuid or str(uuid4()), @@ -775,12 +780,15 @@ def create_project( description: Optional[str] = None, uuid: Optional[str] = None, ignore_if_exists: bool = True, + validate: bool = True, **kwargs, ) -> Project: + if validate: + Project.validate_name(name) try: namespace = self.get_namespace(namespace_name) except NamespaceNotFoundError: - namespace = self.create_namespace(namespace_name) + namespace = self.create_namespace(namespace_name, validate=validate) query = self._projects_insert().values( namespace_id=namespace.id, @@ -817,11 +825,14 @@ def get_project( """Gets a single project inside some namespace by name""" n = self._namespaces p = self._projects + validate = True + if self._is_listing_project(name, namespace_name) or self._is_default_project( name, namespace_name ): # we are always creating default and listing projects if they don't exist create = True + validate = False query = self._projects_select( *(getattr(n.c, f) for f in self._namespaces_fields), @@ -834,7 +845,7 @@ def get_project( rows = list(self.db.execute(query, conn=conn)) if not rows: if create: - return self.create_project(namespace_name, name) + return self.create_project(namespace_name, name, validate=validate) raise ProjectNotFoundError( f"Project {name} in namespace {namespace_name} not found." 
) diff --git a/src/datachain/data_storage/sqlite.py b/src/datachain/data_storage/sqlite.py index 15e68e24f..793d65a06 100644 --- a/src/datachain/data_storage/sqlite.py +++ b/src/datachain/data_storage/sqlite.py @@ -468,8 +468,12 @@ def _init_namespaces_projects(self) -> None: be created implicitly though, to keep the same fully qualified name with Studio dataset. """ - system_namespace = self.create_namespace(Namespace.system(), "System namespace") - self.create_project(system_namespace.name, Project.listing(), "Listing project") + system_namespace = self.create_namespace( + Namespace.system(), "System namespace", validate=False + ) + self.create_project( + system_namespace.name, Project.listing(), "Listing project", validate=False + ) def _check_schema_version(self) -> None: """ diff --git a/src/datachain/dataset.py b/src/datachain/dataset.py index a084c8f3e..a975af17e 100644 --- a/src/datachain/dataset.py +++ b/src/datachain/dataset.py @@ -81,8 +81,10 @@ def create_dataset_uri( def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]: """Parses dataset name and returns namespace, project and name""" if not name: - raise ValueError("Name must be defined to parse it") + raise InvalidDatasetNameError("Name must be defined to parse it") split = name.split(".") + if len(split) > 3: + raise InvalidDatasetNameError(f"Invalid dataset name {name}") name = split[-1] project_name = split[-2] if len(split) > 1 else None namespace_name = split[-3] if len(split) > 2 else None diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index 60366f93c..56b101d12 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -20,6 +20,9 @@ DatasetInvalidVersionError, DatasetNotFoundError, DatasetVersionNotFoundError, + InvalidDatasetNameError, + InvalidNamespaceNameError, + InvalidProjectNameError, ProjectCreateNotAllowedError, ) from datachain.lib.data_model import DataModel @@ -3425,7 +3428,9 @@ def 
test_save_specify_only_non_default_project( default_namespace_name = catalog.metastore.default_namespace_name if project_created_upfront: - catalog.metastore.create_project(default_namespace_name, "numbers") + catalog.metastore.create_project( + default_namespace_name, "numbers", validate=False + ) ds = dc.read_values(fib=[1, 1, 2, 3, 5, 8], session=test_session) if use_settings: @@ -3458,6 +3463,7 @@ def test_save_specify_only_non_default_project( ("", "", "", "", "n1", "p1", "n1", "p1"), ("", "", "", "", "n5", "n1.p1", "n1", "p1"), ("", "", "", "", "", "n1.p1", "n1", "p1"), + ("", "", "", "", "", "n5.p5", "n5", "p5"), ("n3", "p3", "n2", "p2", "", "", "n3", "p3"), ("n3", "p3", "", "", "", "", "n3", "p3"), ("n3", "p3", "", "", "n1", "p1", "n3", "p3"), @@ -3483,9 +3489,6 @@ def _full_name(namespace, project, name) -> str: metastore = test_session.catalog.metastore ds_name = "numbers" - metastore.create_project("n1", "p1") - metastore.create_project("n2", "p2") - metastore.create_project("n3", "p3") monkeypatch.setenv("DATACHAIN_NAMESPACE", env_namespace) monkeypatch.setenv("DATACHAIN_PROJECT", env_project) @@ -3507,6 +3510,51 @@ def _full_name(namespace, project, name) -> str: dc.read_dataset(_full_name(result_ds_namespace, result_ds_project, ds_name)) +@pytest.mark.parametrize( + ( + "ds_name_namespace,ds_name_project," + "settings_namespace,settings_project," + "env_namespace,env_project," + "error" + ), + [ + ("n3.n3", "p3", "n2", "p2", "n1", "p1", InvalidDatasetNameError), + ("n3", "p3.p3", "n2", "p2", "n1", "p1", InvalidDatasetNameError), + ("", "", "n2.n2", "p2", "n1", "p1", InvalidNamespaceNameError), + ("", "", "n2", "p2.p2", "n1", "p1", InvalidProjectNameError), + ("", "", "", "", "n1.n1", "p1", InvalidNamespaceNameError), + ("", "", "", "", "n1", "p1.p1.p1", InvalidProjectNameError), + ], +) +def test_save_all_ways_to_set_project_invalid_name( + test_session, + monkeypatch, + ds_name_namespace, + ds_name_project, + settings_namespace, + settings_project, + 
env_namespace, + env_project, + error, +): + def _full_name(namespace, project, name) -> str: + if namespace and project: + return f"{namespace}.{project}.{name}" + return name + + ds_name = "numbers" + + monkeypatch.setenv("DATACHAIN_NAMESPACE", env_namespace) + monkeypatch.setenv("DATACHAIN_PROJECT", env_project) + + with pytest.raises(error): + ( + dc.read_values(num=[1, 2, 3, 4], session=test_session) + .settings(namespace=settings_namespace, project=settings_project) + .save(_full_name(ds_name_namespace, ds_name_project, ds_name)) + ) + + @pytest.mark.parametrize("allow_create_project", [False]) @skip_if_not_sqlite def test_save_create_project_not_allowed(test_session, allow_create_project): diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index ee865b12a..5e96ee8f4 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -178,5 +178,5 @@ def test_parse_dataset_name(full_name, namespace, project, name): def test_parse_dataset_name_empty_name(): - with pytest.raises(ValueError): + with pytest.raises(InvalidDatasetNameError): assert parse_dataset_name(None)