Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/guide/env.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,8 @@ List of environment variables used to configure DataChain behavior.
- `DATACHAIN_STUDIO_TOKEN` – Authentication token for Studio.
- `DATACHAIN_STUDIO_TEAM` – Studio team name.

### Namespaces and projects
- `DATACHAIN_NAMESPACE` – Namespace name to use as default.
- `DATACHAIN_PROJECT` – Project name or combination of namespace name and project name separated by `.` to use as default, example: `DATACHAIN_PROJECT=dev.analytics`

Note: Some environment variables are used internally and may not be documented here. For the most up-to-date list, refer to the source code.
44 changes: 43 additions & 1 deletion docs/guide/namespaces.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,49 @@ This is equivalent to saving to `dev.analytics.metrics`.

In CLI, `.settings()` is only supported when both `namespace` and `project` are set to `"local"`.

## Setting Namespace and Project via Environment Variables

In addition to using `.settings()`, you can configure the namespace and project using environment variables:

- `DATACHAIN_NAMESPACE` sets the namespace.
- `DATACHAIN_PROJECT` sets the project name, or both the namespace and project using the format `namespace.project`.

### Examples

```bash
# Set namespace only
export DATACHAIN_NAMESPACE=dev

# Set project only
export DATACHAIN_PROJECT=analytics

# Set both namespace and project
export DATACHAIN_PROJECT=dev.analytics
```

## How Namespace and Project Are Resolved

When determining which namespace and project to use, DataChain applies the following precedence:

1. **Fully qualified dataset name**
If the dataset name includes both the namespace and project, these values take highest precedence.
```python
dc.read_dataset("dev.analytics.metrics")
```
2. **Explicit settings in code**
Values provided via `.settings()` or passed directly to `read_dataset()` or similar methods.
```python
dc.settings(namespace="dev", project="analytics")
dc.read_dataset("metrics", namespace="dev", project="analytics")
```
3. **Environment variables**
Namespace and project set using environment variables:
```console
export DATACHAIN_PROJECT=dev.analytics
```
4. **Defaults**
If none of the above are provided, DataChain falls back to the default namespace and project.

## Reading a Dataset from a Project

To read a dataset from a specific namespace and project:
Expand Down Expand Up @@ -116,4 +159,3 @@ dc.read_values(scores=[0.8, 1.5, 2.1]).save("metrics")

ds = dc.read_dataset("local.local.metrics")
ds.show()
```
33 changes: 33 additions & 0 deletions src/datachain/catalog/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -1059,6 +1059,39 @@ def create_dataset_from_sources(

return self.get_dataset(name, project)

def get_full_dataset_name(
    self,
    name: str,
    project_name: Optional[str] = None,
    namespace_name: Optional[str] = None,
) -> tuple[str, str, str]:
    """
    Resolve a dataset name into its (namespace, project, name) parts.

    Resolution precedence for namespace and project (highest first):
    1. values embedded in the dataset name itself ("ns.proj.name"),
    2. the explicit ``namespace_name`` / ``project_name`` arguments,
    3. the ``DATACHAIN_NAMESPACE`` / ``DATACHAIN_PROJECT`` environment
       variables (``DATACHAIN_PROJECT`` may carry both parts as
       ``"<namespace>.<project>"``),
    4. the metastore defaults.
    """
    name_namespace, name_project, name = parse_dataset_name(name)

    env_namespace = os.environ.get("DATACHAIN_NAMESPACE")
    env_project = os.environ.get("DATACHAIN_PROJECT")
    if env_project:
        parts = env_project.split(".")
        if len(parts) == 2:
            # DATACHAIN_PROJECT holds both namespace and project.
            env_namespace, env_project = parts

    resolved_namespace = (
        name_namespace
        or namespace_name
        or env_namespace
        or self.metastore.default_namespace_name
    )
    resolved_project = (
        name_project
        or project_name
        or env_project
        or self.metastore.default_project_name
    )

    return resolved_namespace, resolved_project, name

def get_dataset(
self, name: str, project: Optional[Project] = None
) -> DatasetRecord:
Expand Down
14 changes: 4 additions & 10 deletions src/datachain/cli/commands/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from datachain.cli.utils import determine_flavors
from datachain.config import Config
from datachain.dataset import parse_dataset_name
from datachain.error import DataChainError, DatasetNotFoundError
from datachain.studio import list_datasets as list_datasets_studio

Expand Down Expand Up @@ -106,9 +105,8 @@


def list_datasets_local_versions(catalog: "Catalog", name: str):
namespace_name, project_name, name = parse_dataset_name(name)
namespace_name = namespace_name or catalog.metastore.default_namespace_name
project_name = project_name or catalog.metastore.default_project_name
namespace_name, project_name, name = catalog.get_full_dataset_name(name)

Check warning on line 108 in src/datachain/cli/commands/datasets.py

View check run for this annotation

Codecov / codecov/patch

src/datachain/cli/commands/datasets.py#L108

Added line #L108 was not covered by tests

project = catalog.metastore.get_project(project_name, namespace_name)
ds = catalog.get_dataset(name, project)
for v in ds.versions:
Expand Down Expand Up @@ -137,9 +135,7 @@
studio: Optional[bool] = False,
team: Optional[str] = None,
):
namespace_name, project_name, name = parse_dataset_name(name)
namespace_name = namespace_name or catalog.metastore.default_namespace_name
project_name = project_name or catalog.metastore.default_project_name
namespace_name, project_name, name = catalog.get_full_dataset_name(name)

if not catalog.metastore.is_local_dataset(namespace_name) and studio:
from datachain.studio import remove_studio_dataset
Expand All @@ -166,9 +162,7 @@
attrs: Optional[list[str]] = None,
team: Optional[str] = None,
):
namespace_name, project_name, name = parse_dataset_name(name)
namespace_name = namespace_name or catalog.metastore.default_namespace_name
project_name = project_name or catalog.metastore.default_project_name
namespace_name, project_name, name = catalog.get_full_dataset_name(name)

if catalog.metastore.is_local_dataset(namespace_name):
try:
Expand Down
15 changes: 13 additions & 2 deletions src/datachain/data_storage/metastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def create_namespace(
description: Optional[str] = None,
uuid: Optional[str] = None,
ignore_if_exists: bool = True,
validate: bool = True,
**kwargs,
) -> Namespace:
"""Creates new namespace"""
Expand Down Expand Up @@ -192,6 +193,7 @@ def create_project(
description: Optional[str] = None,
uuid: Optional[str] = None,
ignore_if_exists: bool = True,
validate: bool = True,
**kwargs,
) -> Project:
"""Creates new project in specific namespace"""
Expand Down Expand Up @@ -725,8 +727,11 @@ def create_namespace(
description: Optional[str] = None,
uuid: Optional[str] = None,
ignore_if_exists: bool = True,
validate: bool = True,
**kwargs,
) -> Namespace:
if validate:
Namespace.validate_name(name)
query = self._namespaces_insert().values(
name=name,
uuid=uuid or str(uuid4()),
Expand Down Expand Up @@ -775,12 +780,15 @@ def create_project(
description: Optional[str] = None,
uuid: Optional[str] = None,
ignore_if_exists: bool = True,
validate: bool = True,
**kwargs,
) -> Project:
if validate:
Project.validate_name(name)
try:
namespace = self.get_namespace(namespace_name)
except NamespaceNotFoundError:
namespace = self.create_namespace(namespace_name)
namespace = self.create_namespace(namespace_name, validate=validate)

query = self._projects_insert().values(
namespace_id=namespace.id,
Expand Down Expand Up @@ -817,11 +825,14 @@ def get_project(
"""Gets a single project inside some namespace by name"""
n = self._namespaces
p = self._projects
validate = True

if self._is_listing_project(name, namespace_name) or self._is_default_project(
name, namespace_name
):
# we are always creating default and listing projects if they don't exist
create = True
validate = False

query = self._projects_select(
*(getattr(n.c, f) for f in self._namespaces_fields),
Expand All @@ -834,7 +845,7 @@ def get_project(
rows = list(self.db.execute(query, conn=conn))
if not rows:
if create:
return self.create_project(namespace_name, name)
return self.create_project(namespace_name, name, validate=validate)
raise ProjectNotFoundError(
f"Project {name} in namespace {namespace_name} not found."
)
Expand Down
8 changes: 6 additions & 2 deletions src/datachain/data_storage/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,8 +468,12 @@ def _init_namespaces_projects(self) -> None:
be created implicitly though, to keep the same fully qualified name with
Studio dataset.
"""
system_namespace = self.create_namespace(Namespace.system(), "System namespace")
self.create_project(system_namespace.name, Project.listing(), "Listing project")
system_namespace = self.create_namespace(
Namespace.system(), "System namespace", validate=False
)
self.create_project(
system_namespace.name, Project.listing(), "Listing project", validate=False
)

def _check_schema_version(self) -> None:
"""
Expand Down
4 changes: 3 additions & 1 deletion src/datachain/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,10 @@ def create_dataset_uri(
def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
"""Parses dataset name and returns namespace, project and name"""
if not name:
raise ValueError("Name must be defined to parse it")
raise InvalidDatasetNameError("Name must be defined to parse it")
split = name.split(".")
if len(split) > 3:
raise InvalidDatasetNameError(f"Invalid dataset name {name}")
name = split[-1]
project_name = split[-2] if len(split) > 1 else None
namespace_name = split[-3] if len(split) > 2 else None
Expand Down
18 changes: 6 additions & 12 deletions src/datachain/lib/dc/datachain.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from tqdm import tqdm

from datachain import semver
from datachain.dataset import DatasetRecord, parse_dataset_name
from datachain.dataset import DatasetRecord
from datachain.delta import delta_disabled
from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
from datachain.func import literal
Expand Down Expand Up @@ -557,6 +557,7 @@ def save( # type: ignore[override]
update_version: which part of the dataset version to automatically increase.
Available values: `major`, `minor` or `patch`. Default is `patch`.
"""
catalog = self.session.catalog
if version is not None:
semver.validate(version)

Expand All @@ -570,17 +571,10 @@ def save( # type: ignore[override]
" patch"
)

namespace_name, project_name, name = parse_dataset_name(name)

namespace_name = (
namespace_name
or self._settings.namespace
or self.session.catalog.metastore.default_namespace_name
)
project_name = (
project_name
or self._settings.project
or self.session.catalog.metastore.default_project_name
namespace_name, project_name, name = catalog.get_full_dataset_name(
name,
namespace_name=self._settings.namespace,
project_name=self._settings.project,
)

try:
Expand Down
17 changes: 8 additions & 9 deletions src/datachain/lib/dc/datasets.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from collections.abc import Sequence
from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints

from datachain.dataset import parse_dataset_name
from datachain.error import (
DatasetNotFoundError,
DatasetVersionNotFoundError,
Expand Down Expand Up @@ -125,11 +124,11 @@ def read_dataset(
session = Session.get(session)
catalog = session.catalog

namespace_name, project_name, name = parse_dataset_name(name)
namespace_name = (
namespace_name or namespace or catalog.metastore.default_namespace_name
namespace_name, project_name, name = catalog.get_full_dataset_name(
name,
project_name=project,
namespace_name=namespace,
)
project_name = project_name or project or catalog.metastore.default_project_name

if version is not None:
try:
Expand Down Expand Up @@ -320,11 +319,11 @@ def delete_dataset(
session = Session.get(session, in_memory=in_memory)
catalog = session.catalog

namespace_name, project_name, name = parse_dataset_name(name)
namespace_name = (
namespace_name or namespace or catalog.metastore.default_namespace_name
namespace_name, project_name, name = catalog.get_full_dataset_name(
name,
project_name=project,
namespace_name=namespace,
)
project_name = project_name or project or catalog.metastore.default_project_name

if not catalog.metastore.is_local_dataset(namespace_name) and studio:
return remove_studio_dataset(
Expand Down
2 changes: 1 addition & 1 deletion src/datachain/lib/dc/records.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,4 @@ def read_records(
for chunk in batched(records, INSERT_BATCH_SIZE):
warehouse.insert_rows(table, chunk)
warehouse.insert_rows_done(table)
return read_dataset(name=dsr.name, session=session, settings=settings)
return read_dataset(name=dsr.full_name, session=session, settings=settings)
Loading
Loading