From fa09d0ba0707c96a5fda85659115095fd4339910 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 19 Feb 2025 14:21:15 +0100 Subject: [PATCH 01/45] adding incremental update --- src/datachain/lib/dc.py | 57 +++++++++++++++++++++++++++++++++++- tests/func/test_datachain.py | 41 ++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 2b3429baf..0af60169f 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -411,6 +411,7 @@ def from_storage( object_name: str = "file", update: bool = False, anon: bool = False, + incremental: bool = False ) -> "Self": """Get data from a storage as a list of file with all file attributes. It returns the chain itself as usual. @@ -735,7 +736,11 @@ def listings( ) def save( # type: ignore[override] - self, name: Optional[str] = None, version: Optional[int] = None, **kwargs + self, + name: Optional[str] = None, + version: Optional[int] = None, + incremental: Optional[bool] = False, + **kwargs, ) -> "Self": """Save to a Dataset. It returns the chain itself. @@ -743,8 +748,58 @@ def save( # type: ignore[override] name : dataset name. Empty name saves to a temporary dataset that will be removed after process ends. Temp dataset are useful for optimization. version : version of a dataset. Default - the last version that exist. + incremental : wheather this is an incremental dataset or not. """ schema = self.signals_schema.clone_without_sys_signals().serialize() + if incremental and name: + """ + DataChain + .from_storage("s3://bkt/dir1/") + .filter(C("file.path").glob("*.jpg")) + .map(emb=my_embedding) + .save("incremental_ds") + + -> + DataChain + .from_storage("s3://bkt/dir1/") + .diff( + DataChain.from_dataset("incremental_ds", version=3), + on="file", # this should be get from ds feature schema + added=True, + modified=True, + ) + .filter(C("file.path").glob("*.jpg")) + .map(emb=my_embedding) + .save("incremental_ds") + + """ + from datachain.error import DatasetNotFoundError + try: + incremental_ds = self.session.catalog.get_dataset(name) + latest_version = incremental_ds.latest_version + diff = ( + DataChain.from_dataset( + self._query.starting_step.dataset_name, + version=self._query.starting_step.dataset_version + ) + .diff( + DataChain.from_dataset(name, version=latest_version), + on="file", # this should be get from ds feature schema + added=True, + modified=True, + ) + ) + diff._query.steps += self._query.steps + diff = diff.union(DataChain.from_dataset(name, latest_version)) + return self._evolve( + query=diff._query.save( + name=name, version=version, feature_schema=schema, **kwargs + ) + ) + except DatasetNotFoundError: + # dataset still doesn't exists so we continue with normal cration + pass + return self._evolve( query=self._query.save( name=name, version=version, feature_schema=schema, **kwargs diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index a03f542d5..e7f6bd620 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1788,3 +1788,44 @@ def func(key: str) -> str: for _ in range(4): with pytest.raises(Exception, match="Test Error!"): dc.map(res=func).exec() + + +def test_incremental_update(test_session, tmp_dir, tmp_path): + ds_name = "incremental_ds" + images = [ + {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + ] + + for img in images: + img["data"].save(tmp_path / img["name"]) + + 
DataChain.from_values( + file=[ + ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images + ], + session=test_session, + ).save(ds_name, incremental=True) + + new_images = [ + {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + ] + for img in new_images: + img["data"].save(tmp_path / img["name"]) + + images += new_images + + DataChain.from_values( + file=[ + ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images + ], + session=test_session, + ).save(ds_name, incremental=True) + + for im in dc.collect("file"): + print(im.path) + + assert 1 == 2 + + From 99a532746297de94bc9d114f3441f7efd927deec Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 19 Feb 2025 15:12:57 +0100 Subject: [PATCH 02/45] continued working on incremental --- src/datachain/lib/dc.py | 4 +++- tests/func/test_datachain.py | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 0af60169f..8f3d9dc9e 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -777,6 +777,8 @@ def save( # type: ignore[override] try: incremental_ds = self.session.catalog.get_dataset(name) latest_version = incremental_ds.latest_version + print(f"Starting ds is {self._query.starting_step.dataset_name}") + print(f"Starting ds version is {self._query.starting_step.dataset_version}") diff = ( DataChain.from_dataset( self._query.starting_step.dataset_name, @@ -797,7 +799,7 @@ def save( # type: ignore[override] ) ) except DatasetNotFoundError: - # dataset still doesn't exists so we continue with normal cration + # dataset doesn't exist yet so we can continue with normal cration pass return self._evolve( diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index e7f6bd620..809050ed1 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1791,6 +1791,7 @@ def func(key: str) -> str: def test_incremental_update(test_session, tmp_dir, tmp_path): + starting_ds_name = "starting_ds" ds_name = "incremental_ds" images = [ {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, @@ -1805,6 +1806,10 @@ def test_incremental_update(test_session, tmp_dir, tmp_path): ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images ], session=test_session, + ).save(starting_ds_name) + + DataChain.from_dataset( + starting_ds_name, session=test_session, ).save(ds_name, incremental=True) new_images = [ @@ -1815,15 +1820,25 @@ def test_incremental_update(test_session, tmp_dir, tmp_path): img["data"].save(tmp_path / img["name"]) images += new_images - DataChain.from_values( file=[ ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images ], session=test_session, + ).save(starting_ds_name) + + DataChain.from_dataset( + starting_ds_name, session=test_session, ).save(ds_name, incremental=True) - for im in dc.collect("file"): + dc = DataChain.from_dataset(ds_name) + + print("Images in version 1 are") + for im in DataChain.from_dataset(ds_name, version=1).collect("file"): + print(im.path) + + print("Images in version 2 are") + for im in DataChain.from_dataset(ds_name, version=2).collect("file"): print(im.path) assert 1 == 2 From f01b3a2b11c0c1d3216a09c3c163d1c97c6554f9 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 19 Feb 2025 16:11:57 +0100 Subject: [PATCH 03/45] finixhed first test --- tests/func/test_datachain.py | 77 +++++++++++++++++++----------------- 1 file changed, 
40 insertions(+), 37 deletions(-) diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 809050ed1..1cd0d2628 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1790,57 +1790,60 @@ def func(key: str) -> str: dc.map(res=func).exec() -def test_incremental_update(test_session, tmp_dir, tmp_path): +def test_incremental_update_from_dataset(test_session, tmp_dir, tmp_path): starting_ds_name = "starting_ds" ds_name = "incremental_ds" + images = [ {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, ] - for img in images: - img["data"].save(tmp_path / img["name"]) + def create_image_dataset(ds_name, images): + DataChain.from_values( + file=[ + ImageFile(path=img["name"], source=f"file://{tmp_path}") + for img in images + ], + session=test_session, + ).save(ds_name) - DataChain.from_values( - file=[ - ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images - ], - session=test_session, - ).save(starting_ds_name) + # first version of starting dataset + create_image_dataset(starting_ds_name, images[:2]) + # first version of incremental dataset DataChain.from_dataset( - starting_ds_name, session=test_session, + starting_ds_name, + session=test_session, ).save(ds_name, incremental=True) - new_images = [ - {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, - {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, - ] - for img in new_images: - img["data"].save(tmp_path / img["name"]) - - images += new_images - DataChain.from_values( - file=[ - ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images - ], - session=test_session, - ).save(starting_ds_name) + # second version of starting dataset + create_image_dataset(starting_ds_name, images[2:]) + # second version of incremental dataset DataChain.from_dataset( - starting_ds_name, session=test_session, + starting_ds_name, + session=test_session, ).save(ds_name, incremental=True) - dc = DataChain.from_dataset(ds_name) - - print("Images in version 1 are") - for im in DataChain.from_dataset(ds_name, version=1).collect("file"): - print(im.path) - - print("Images in version 2 are") - for im in DataChain.from_dataset(ds_name, version=2).collect("file"): - print(im.path) - - assert 1 == 2 - + assert list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) == [ + "img1.jpg", + "img2.jpg", + ] + assert list( + DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) == [ + "img1.jpg", + "img2.jpg", + "img3.jpg", + "img4.jpg", + ] From 8fa15344a1c447cb566cc0cae69ab0cd398e1571 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 21 Feb 2025 03:20:10 +0100 Subject: [PATCH 04/45] added from storage incremental update test --- tests/func/test_datachain.py | 55 ++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 1cd0d2628..d996944d2 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1847,3 +1847,58 @@ def create_image_dataset(ds_name, images): "img3.jpg", "img4.jpg", ] + + +def test_incremental_update_from_storage(test_session, tmp_dir, tmp_path): + ds_name = "incremental_ds" + images = [ + {"name": "img1.jpg", "data": 
Image.new(mode="RGB", size=(64, 64))}, + {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + ] + path = tmp_dir.as_uri() + tmp_dir = tmp_dir / "images" + os.mkdir(tmp_dir) + + # save only 2 images + for img in images[:2]: + img["data"].save(tmp_dir / img["name"]) + + # first version of incremental dataset + DataChain.from_storage( + path, + update=True, + session=test_session, + ).save(ds_name, incremental=True) + + # save other 2 images as well + for img in images[2:]: + img["data"].save(tmp_dir / img["name"]) + + # second version of incremental dataset + DataChain.from_storage( + path, + update=True, + session=test_session, + ).save(ds_name, incremental=True) + + assert list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) == [ + "images/img1.jpg", + "images/img2.jpg", + ] + + assert list( + DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) == [ + "images/img1.jpg", + "images/img2.jpg", + "images/img3.jpg", + "images/img4.jpg", + ] From 67824e697be949355bd01252ee10aa46b8be553e Mon Sep 17 00:00:00 2001 From: ilongin Date: Fri, 21 Feb 2025 14:24:04 +0100 Subject: [PATCH 05/45] refactoring --- src/datachain/lib/dc.py | 57 ++++++++----------------- src/datachain/query/dataset.py | 15 +++---- tests/func/test_datachain.py | 78 +++++++++++++++++++--------------- 3 files changed, 67 insertions(+), 83 deletions(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 8f3d9dc9e..0ff0b435b 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -25,6 +25,7 @@ from sqlalchemy.sql.sqltypes import NullType from datachain.dataset import DatasetRecord +from datachain.error import DatasetNotFoundError from datachain.func import literal from datachain.func.base import Function from datachain.func.func import Func @@ -411,7 +412,7 @@ def from_storage( object_name: str = "file", update: bool = False, anon: bool = False, - incremental: bool = False + incremental: bool = False, ) -> "Self": """Get data from a storage as a list of file with all file attributes. It returns the chain itself as usual. @@ -748,58 +749,36 @@ def save( # type: ignore[override] name : dataset name. Empty name saves to a temporary dataset that will be removed after process ends. Temp dataset are useful for optimization. version : version of a dataset. Default - the last version that exist. - incremental : wheather this is an incremental dataset or not. + incremental : whether this is an incremental dataset or not. 
""" schema = self.signals_schema.clone_without_sys_signals().serialize() if incremental and name: - """ - DataChain - .from_storage("s3://bkt/dir1/") - .filter(C("file.path").glob("*.jpg")) - .map(emb=my_embedding) - .save("incremental_ds") - - -> - DataChain - .from_storage("s3://bkt/dir1/") - .diff( - DataChain.from_dataset("incremental_ds", version=3), - on="file", # this should be get from ds feature schema + try: + latest_version = self.session.catalog.get_dataset(name).latest_version + source_ds_name = self._query.starting_step.dataset_name + source_ds_version = self._query.starting_step.dataset_version + diff = DataChain.from_dataset( + source_ds_name, version=source_ds_version + ).diff( + DataChain.from_dataset(name, version=latest_version), + on="file", # TODO this should be taken from ds feature schema added=True, modified=True, ) - .filter(C("file.path").glob("*.jpg")) - .map(emb=my_embedding) - .save("incremental_ds") - - """ - from datachain.error import DatasetNotFoundError - try: - incremental_ds = self.session.catalog.get_dataset(name) - latest_version = incremental_ds.latest_version - print(f"Starting ds is {self._query.starting_step.dataset_name}") - print(f"Starting ds version is {self._query.starting_step.dataset_version}") - diff = ( - DataChain.from_dataset( - self._query.starting_step.dataset_name, - version=self._query.starting_step.dataset_version - ) - .diff( - DataChain.from_dataset(name, version=latest_version), - on="file", # this should be get from ds feature schema - added=True, - modified=True, - ) - ) + # we append all the steps from original chain to diff dataset, + # e.g filters, mappers, mutates etc. diff._query.steps += self._query.steps + + # merging diff and latest version of our dataset chains diff = diff.union(DataChain.from_dataset(name, latest_version)) + return self._evolve( query=diff._query.save( name=name, version=version, feature_schema=schema, **kwargs ) ) except DatasetNotFoundError: - # dataset doesn't exist yet so we can continue with normal cration + # dataset doesn't exist yet so we can continue with normal flow pass return self._evolve( diff --git a/src/datachain/query/dataset.py b/src/datachain/query/dataset.py index 3b0eb420e..0093bcd0d 100644 --- a/src/datachain/query/dataset.py +++ b/src/datachain/query/dataset.py @@ -153,13 +153,6 @@ def step_result( ) -class StartingStep(ABC): - """An initial query processing step, referencing a data source.""" - - @abstractmethod - def apply(self) -> "StepResult": ... 
- - @frozen class Step(ABC): """A query processing step (filtering, mutation, etc.)""" @@ -172,12 +165,14 @@ def apply( @frozen -class QueryStep(StartingStep): +class QueryStep: + """A query that returns all rows from specific dataset version""" + catalog: "Catalog" dataset_name: str dataset_version: int - def apply(self): + def apply(self) -> "StepResult": def q(*columns): return sqlalchemy.select(*columns) @@ -1095,7 +1090,7 @@ def __init__( self.temp_table_names: list[str] = [] self.dependencies: set[DatasetDependencyType] = set() self.table = self.get_table() - self.starting_step: StartingStep + self.starting_step: QueryStep self.name: Optional[str] = None self.version: Optional[int] = None self.feature_schema: Optional[dict] = None diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index d996944d2..7c3cc0bee 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1810,23 +1810,20 @@ def create_image_dataset(ds_name, images): session=test_session, ).save(ds_name) + def create_incremental_dataset(ds_name): + DataChain.from_dataset( + starting_ds_name, + session=test_session, + ).save(ds_name, incremental=True) + # first version of starting dataset create_image_dataset(starting_ds_name, images[:2]) - # first version of incremental dataset - DataChain.from_dataset( - starting_ds_name, - session=test_session, - ).save(ds_name, incremental=True) - + create_incremental_dataset(ds_name) # second version of starting dataset create_image_dataset(starting_ds_name, images[2:]) - # second version of incremental dataset - DataChain.from_dataset( - starting_ds_name, - session=test_session, - ).save(ds_name, incremental=True) + create_incremental_dataset(ds_name) assert list( DataChain.from_dataset(ds_name, version=1) @@ -1851,45 +1848,54 @@ def create_image_dataset(ds_name, images): def test_incremental_update_from_storage(test_session, tmp_dir, tmp_path): ds_name = "incremental_ds" - images = [ - {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, - {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, - {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, - {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, - ] path = tmp_dir.as_uri() tmp_dir = tmp_dir / "images" os.mkdir(tmp_dir) - # save only 2 images - for img in images[:2]: + images = [ + { + "name": f"img{i}.{'jpg' if i % 2 == 0 else 'png'}", + "data": Image.new(mode="RGB", size=((i + 1) * 10, (i + 1) * 10)), + } + for i in range(20) + ] + + # save only half of the images for now + for img in images[:10]: img["data"].save(tmp_dir / img["name"]) + def create_incremental_dataset(): + def my_embedding(file: File) -> list[float]: + return [0.5, 0.5] + + ( + DataChain.from_storage(path, update=True, session=test_session) + .filter(C("file.path").glob("*.jpg")) + .map(emb=my_embedding) + .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) + .filter(C("file.size") % 10 < 5) + .save(ds_name, incremental=True) + ) + # first version of incremental dataset - DataChain.from_storage( - path, - update=True, - session=test_session, - ).save(ds_name, incremental=True) + create_incremental_dataset() - # save other 2 images as well - for img in images[2:]: + # save other half of images as well + for img in images[10:]: img["data"].save(tmp_dir / img["name"]) # second version of incremental dataset - DataChain.from_storage( - path, - update=True, - session=test_session, - ).save(ds_name, incremental=True) + create_incremental_dataset() assert list( 
DataChain.from_dataset(ds_name, version=1) .order_by("file.path") .collect("file.path") ) == [ - "images/img1.jpg", + "images/img0.jpg", "images/img2.jpg", + "images/img4.jpg", + "images/img8.jpg", ] assert list( @@ -1897,8 +1903,12 @@ def test_incremental_update_from_storage(test_session, tmp_dir, tmp_path): .order_by("file.path") .collect("file.path") ) == [ - "images/img1.jpg", + "images/img0.jpg", + "images/img10.jpg", + "images/img12.jpg", + "images/img16.jpg", + "images/img18.jpg", "images/img2.jpg", - "images/img3.jpg", "images/img4.jpg", + "images/img8.jpg", ] From ee6640d54eda867d54757cd0e339c58d798cd15a Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 24 Feb 2025 15:36:10 +0100 Subject: [PATCH 06/45] using delta instead of incremental --- src/datachain/lib/dc.py | 7 +++---- tests/func/test_datachain.py | 32 ++++++++++++++++---------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 599a1c708..5032ddb69 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -412,7 +412,6 @@ def from_storage( object_name: str = "file", update: bool = False, anon: bool = False, - incremental: bool = False, client_config: Optional[dict] = None, ) -> "Self": """Get data from a storage as a list of file with all file attributes. @@ -758,7 +757,7 @@ def save( # type: ignore[override] self, name: Optional[str] = None, version: Optional[int] = None, - incremental: Optional[bool] = False, + delta: Optional[bool] = False, **kwargs, ) -> "Self": """Save to a Dataset. It returns the chain itself. @@ -767,10 +766,10 @@ def save( # type: ignore[override] name : dataset name. Empty name saves to a temporary dataset that will be removed after process ends. Temp dataset are useful for optimization. version : version of a dataset. Default - the last version that exist. - incremental : whether this is an incremental dataset or not. + delta : whether this is an delta dataset or not. 
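         A usage sketch of the `delta` flag, based on the functional tests added in
         this patch series (the bucket path and the `my_embedding` function are
         illustrative, not part of this patch):

         ```py
         # The first run creates version 1 of "delta_ds"; a later run of the same
         # script only processes files that were added or modified in storage since
         # then and saves the combined result as version 2.
         (
             DataChain.from_storage("s3://bkt/images/", update=True)
             .filter(C("file.path").glob("*.jpg"))
             .map(emb=my_embedding)
             .save("delta_ds", delta=True)
         )
         ```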
""" schema = self.signals_schema.clone_without_sys_signals().serialize() - if incremental and name: + if delta and name: try: latest_version = self.session.catalog.get_dataset(name).latest_version source_ds_name = self._query.starting_step.dataset_name diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 403d85b23..18ad7eee1 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1800,9 +1800,9 @@ def func(key: str) -> str: dc.map(res=func).exec() -def test_incremental_update_from_dataset(test_session, tmp_dir, tmp_path): +def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path): starting_ds_name = "starting_ds" - ds_name = "incremental_ds" + ds_name = "delta_ds" images = [ {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, @@ -1820,20 +1820,20 @@ def create_image_dataset(ds_name, images): session=test_session, ).save(ds_name) - def create_incremental_dataset(ds_name): + def create_delta_dataset(ds_name): DataChain.from_dataset( starting_ds_name, session=test_session, - ).save(ds_name, incremental=True) + ).save(ds_name, delta=True) # first version of starting dataset create_image_dataset(starting_ds_name, images[:2]) - # first version of incremental dataset - create_incremental_dataset(ds_name) + # first version of delta dataset + create_delta_dataset(ds_name) # second version of starting dataset create_image_dataset(starting_ds_name, images[2:]) - # second version of incremental dataset - create_incremental_dataset(ds_name) + # second version of delta dataset + create_delta_dataset(ds_name) assert list( DataChain.from_dataset(ds_name, version=1) @@ -1856,8 +1856,8 @@ def create_incremental_dataset(ds_name): ] -def test_incremental_update_from_storage(test_session, tmp_dir, tmp_path): - ds_name = "incremental_ds" +def test_delta_update_from_storage(test_session, tmp_dir, tmp_path): + ds_name = "delta_ds" path = tmp_dir.as_uri() tmp_dir = tmp_dir / "images" os.mkdir(tmp_dir) @@ -1874,7 +1874,7 @@ def test_incremental_update_from_storage(test_session, tmp_dir, tmp_path): for img in images[:10]: img["data"].save(tmp_dir / img["name"]) - def create_incremental_dataset(): + def create_delta_dataset(): def my_embedding(file: File) -> list[float]: return [0.5, 0.5] @@ -1884,18 +1884,18 @@ def my_embedding(file: File) -> list[float]: .map(emb=my_embedding) .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) .filter(C("file.size") % 10 < 5) - .save(ds_name, incremental=True) + .save(ds_name, delta=True) ) - # first version of incremental dataset - create_incremental_dataset() + # first version of delta dataset + create_delta_dataset() # save other half of images as well for img in images[10:]: img["data"].save(tmp_dir / img["name"]) - # second version of incremental dataset - create_incremental_dataset() + # second version of delta dataset + create_delta_dataset() assert list( DataChain.from_dataset(ds_name, version=1) From 5e446b58022f8289780c4cd24a5c5ab135fa46de Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 00:27:31 +0100 Subject: [PATCH 07/45] added check for modification --- tests/func/test_datachain.py | 41 ++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 18ad7eee1..aa632057c 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1878,20 +1878,36 @@ def create_delta_dataset(): def my_embedding(file: File) -> list[float]: return [0.5, 0.5] + def 
get_index(file: File) -> int: + r = r".+\/img(\d+)\.jpg" + return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] + ( DataChain.from_storage(path, update=True, session=test_session) .filter(C("file.path").glob("*.jpg")) .map(emb=my_embedding) .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) - .filter(C("file.size") % 10 < 5) + .map(index=get_index) + .filter(C("index") > 3) .save(ds_name, delta=True) ) # first version of delta dataset create_delta_dataset() - # save other half of images as well - for img in images[10:]: + # remember old etags for later comparison to prove modified images are also taken + # into consideration on delta update + etags = { + r[0]: r[1].etag + for r in DataChain.from_dataset(ds_name, version=1).collect("index", "file") + } + + # remove last couple of images to simulate modification since we will re-create it + for img in images[5:10]: + os.remove(tmp_dir / img["name"]) + + # save other half of images and the ones that are removed above + for img in images[5:]: img["data"].save(tmp_dir / img["name"]) # second version of delta dataset @@ -1902,9 +1918,8 @@ def my_embedding(file: File) -> list[float]: .order_by("file.path") .collect("file.path") ) == [ - "images/img0.jpg", - "images/img2.jpg", "images/img4.jpg", + "images/img6.jpg", "images/img8.jpg", ] @@ -1913,12 +1928,24 @@ def my_embedding(file: File) -> list[float]: .order_by("file.path") .collect("file.path") ) == [ - "images/img0.jpg", "images/img10.jpg", "images/img12.jpg", + "images/img14.jpg", "images/img16.jpg", "images/img18.jpg", - "images/img2.jpg", "images/img4.jpg", + "images/img6.jpg", + "images/img6.jpg", + "images/img8.jpg", "images/img8.jpg", ] + + # check that we have both old and new version of those that are modified + rows = list( + DataChain.from_dataset(ds_name, version=2) + .filter(C("index") == 6) + .order_by("file.path", "file.etag") + .collect("file") + ) + assert rows[0].etag == etags[6] + assert rows[1].etag > etags[6] # new etag is bigger as it's the value of mtime From 71c3469f3130aeb57521746861412203c0a12ab8 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 01:25:32 +0100 Subject: [PATCH 08/45] added another test --- tests/func/test_datachain.py | 50 ++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index aa632057c..930be6ac9 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1949,3 +1949,53 @@ def get_index(file: File) -> int: ) assert rows[0].etag == etags[6] assert rows[1].etag > etags[6] # new etag is bigger as it's the value of mtime + + +def test_delta_update_no_diff(test_session, tmp_dir, tmp_path): + ds_name = "delta_ds" + path = tmp_dir.as_uri() + tmp_dir = tmp_dir / "images" + os.mkdir(tmp_dir) + + images = [ + {"name": f"img{i}.jpg", "data": Image.new(mode="RGB", size=(64, 128))} + for i in range(10) + ] + + for img in images: + img["data"].save(tmp_dir / img["name"]) + + def create_delta_dataset(): + def get_index(file: File) -> int: + r = r".+\/img(\d+)\.jpg" + return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] + + ( + DataChain.from_storage(path, update=True, session=test_session) + .filter(C("file.path").glob("*.jpg")) + .map(index=get_index) + .filter(C("index") > 5) + .save(ds_name, delta=True) + ) + + create_delta_dataset() + create_delta_dataset() + + assert ( + list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) + == list( + 
DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) + == [ + "images/img6.jpg", + "images/img7.jpg", + "images/img8.jpg", + "images/img9.jpg", + ] + ) From 83366aa2b646cf4eb752aa55ede5668757beb519 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 13:26:04 +0100 Subject: [PATCH 09/45] refactoring --- src/datachain/delta.py | 30 ++++++++++++++++++++++++++++ src/datachain/lib/dc.py | 29 ++++----------------------- src/datachain/lib/signal_schema.py | 11 +++++----- tests/func/test_datachain.py | 16 +++++++++++++++ tests/unit/lib/test_signal_schema.py | 5 +++++ 5 files changed, 60 insertions(+), 31 deletions(-) create mode 100644 src/datachain/delta.py diff --git a/src/datachain/delta.py b/src/datachain/delta.py new file mode 100644 index 000000000..809803836 --- /dev/null +++ b/src/datachain/delta.py @@ -0,0 +1,30 @@ +from typing import TYPE_CHECKING, Optional + +from datachain.error import DatasetNotFoundError + +if TYPE_CHECKING: + from datachain.lib.dc import DataChain + + +def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: + from datachain.lib.dc import DataChain + + file_signal = dc.signals_schema.get_file_signal() + if not file_signal: + raise ValueError("Datasets without file signals cannot have delta updates") + try: + latest_version = dc.session.catalog.get_dataset(name).latest_version + except DatasetNotFoundError: + return None + + source_ds_name = dc._query.starting_step.dataset_name + source_ds_version = dc._query.starting_step.dataset_version + diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( + DataChain.from_dataset(name, version=latest_version), on=file_signal + ) + # we append all the steps from original chain to diff, + # e.g filters, mappers, generators etc. + diff._query.steps += dc._query.steps + + # merging diff and latest version of our dataset chains + return diff.union(DataChain.from_dataset(name, latest_version)) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 5032ddb69..f5e0f0168 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -25,7 +25,7 @@ from sqlalchemy.sql.sqltypes import NullType from datachain.dataset import DatasetRecord -from datachain.error import DatasetNotFoundError +from datachain.delta import delta_update from datachain.func import literal from datachain.func.base import Function from datachain.func.func import Func @@ -770,34 +770,13 @@ def save( # type: ignore[override] """ schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: - try: - latest_version = self.session.catalog.get_dataset(name).latest_version - source_ds_name = self._query.starting_step.dataset_name - source_ds_version = self._query.starting_step.dataset_version - diff = DataChain.from_dataset( - source_ds_name, version=source_ds_version - ).diff( - DataChain.from_dataset(name, version=latest_version), - on="file", # TODO this should be taken from ds feature schema - added=True, - modified=True, - ) - # we append all the steps from original chain to diff dataset, - # e.g filters, mappers, mutates etc. 
- diff._query.steps += self._query.steps - - # merging diff and latest version of our dataset chains - diff = diff.union(DataChain.from_dataset(name, latest_version)) - + delta_ds = delta_update(self, name) + if delta_ds: return self._evolve( - query=diff._query.save( + query=delta_ds._query.save( name=name, version=version, feature_schema=schema, **kwargs ) ) - except DatasetNotFoundError: - # dataset doesn't exist yet so we can continue with normal flow - pass - return self._evolve( query=self._query.save( name=name, version=version, feature_schema=schema, **kwargs diff --git a/src/datachain/lib/signal_schema.py b/src/datachain/lib/signal_schema.py index d723c5b8d..17c8a8b4b 100644 --- a/src/datachain/lib/signal_schema.py +++ b/src/datachain/lib/signal_schema.py @@ -410,14 +410,13 @@ def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]: pos += 1 return objs - def contains_file(self) -> bool: - for type_ in self.values.values(): - if (fr := ModelStore.to_pydantic(type_)) is not None and issubclass( + def get_file_signal(self) -> Optional[str]: + for signal_name, signal_type in self.values.items(): + if (fr := ModelStore.to_pydantic(signal_type)) is not None and issubclass( fr, File ): - return True - - return False + return signal_name + return None def slice( self, keys: Sequence[str], setup: Optional[dict[str, Callable]] = None diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 930be6ac9..5ac3a7912 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1999,3 +1999,19 @@ def get_index(file: File) -> int: "images/img9.jpg", ] ) + + +def test_delta_update_no_file_signals(test_session): + starting_ds_name = "starting_ds" + + DataChain.from_values(num=[10, 20], session=test_session).save(starting_ds_name) + + with pytest.raises(ValueError) as excinfo: + DataChain.from_dataset( + starting_ds_name, + session=test_session, + ).save("delta_ds", delta=True) + + assert ( + str(excinfo.value) == "Datasets without file signals cannot have delta updates" + ) diff --git a/tests/unit/lib/test_signal_schema.py b/tests/unit/lib/test_signal_schema.py index cef421b5d..dc00dbf9b 100644 --- a/tests/unit/lib/test_signal_schema.py +++ b/tests/unit/lib/test_signal_schema.py @@ -992,3 +992,8 @@ def test_column_types(column_type, signal_type): assert len(signals) == 1 assert signals["val"] is signal_type + + +def test_get_file_signal(): + assert SignalSchema({"name": str, "f": File}).get_file_signal() == "f" + assert SignalSchema({"name": str}).get_file_signal() is None From a22916c1c54172c65c536d5a53f3530f3b987816 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 15:59:15 +0100 Subject: [PATCH 10/45] added comment --- src/datachain/delta.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 809803836..1b1406e3b 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -7,6 +7,13 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: + """ + Creates new chain that consists of the last version of current delta dataset + plus diff from the source with all needed modifications. + This way we don't need to re-calculate the whole chain from the source again( + apply all the DataChain methods like filters, mappers, generators etc.) + but just the diff part which is very important for performance. 
+ """ from datachain.lib.dc import DataChain file_signal = dc.signals_schema.get_file_signal() @@ -15,6 +22,7 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: try: latest_version = dc.session.catalog.get_dataset(name).latest_version except DatasetNotFoundError: + # first creation of delta update dataset return None source_ds_name = dc._query.starting_step.dataset_name @@ -22,9 +30,10 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( DataChain.from_dataset(name, version=latest_version), on=file_signal ) - # we append all the steps from original chain to diff, - # e.g filters, mappers, generators etc. + # we append all the steps from the original chain to diff, + # e.g filters, mappers, generators etc. With this we make sure we add all + # needed modifications to diff part as well diff._query.steps += dc._query.steps - # merging diff and latest version of our dataset chains + # merging diff and the latest version of our dataset return diff.union(DataChain.from_dataset(name, latest_version)) From d9e4f26a54abb46dfe633facf4f86927899454e6 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 16:06:42 +0100 Subject: [PATCH 11/45] split tests in new file --- tests/func/test_datachain.py | 217 --------------------------------- tests/func/test_delta.py | 226 +++++++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+), 217 deletions(-) create mode 100644 tests/func/test_delta.py diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 5ac3a7912..8e4599911 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1798,220 +1798,3 @@ def func(key: str) -> str: for _ in range(4): with pytest.raises(Exception, match="Test Error!"): dc.map(res=func).exec() - - -def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path): - starting_ds_name = "starting_ds" - ds_name = "delta_ds" - - images = [ - {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, - {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, - {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, - {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, - ] - - def create_image_dataset(ds_name, images): - DataChain.from_values( - file=[ - ImageFile(path=img["name"], source=f"file://{tmp_path}") - for img in images - ], - session=test_session, - ).save(ds_name) - - def create_delta_dataset(ds_name): - DataChain.from_dataset( - starting_ds_name, - session=test_session, - ).save(ds_name, delta=True) - - # first version of starting dataset - create_image_dataset(starting_ds_name, images[:2]) - # first version of delta dataset - create_delta_dataset(ds_name) - # second version of starting dataset - create_image_dataset(starting_ds_name, images[2:]) - # second version of delta dataset - create_delta_dataset(ds_name) - - assert list( - DataChain.from_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") - ) == [ - "img1.jpg", - "img2.jpg", - ] - - assert list( - DataChain.from_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") - ) == [ - "img1.jpg", - "img2.jpg", - "img3.jpg", - "img4.jpg", - ] - - -def test_delta_update_from_storage(test_session, tmp_dir, tmp_path): - ds_name = "delta_ds" - path = tmp_dir.as_uri() - tmp_dir = tmp_dir / "images" - os.mkdir(tmp_dir) - - images = [ - { - "name": f"img{i}.{'jpg' if i % 2 == 0 else 'png'}", - "data": 
Image.new(mode="RGB", size=((i + 1) * 10, (i + 1) * 10)), - } - for i in range(20) - ] - - # save only half of the images for now - for img in images[:10]: - img["data"].save(tmp_dir / img["name"]) - - def create_delta_dataset(): - def my_embedding(file: File) -> list[float]: - return [0.5, 0.5] - - def get_index(file: File) -> int: - r = r".+\/img(\d+)\.jpg" - return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] - - ( - DataChain.from_storage(path, update=True, session=test_session) - .filter(C("file.path").glob("*.jpg")) - .map(emb=my_embedding) - .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) - .map(index=get_index) - .filter(C("index") > 3) - .save(ds_name, delta=True) - ) - - # first version of delta dataset - create_delta_dataset() - - # remember old etags for later comparison to prove modified images are also taken - # into consideration on delta update - etags = { - r[0]: r[1].etag - for r in DataChain.from_dataset(ds_name, version=1).collect("index", "file") - } - - # remove last couple of images to simulate modification since we will re-create it - for img in images[5:10]: - os.remove(tmp_dir / img["name"]) - - # save other half of images and the ones that are removed above - for img in images[5:]: - img["data"].save(tmp_dir / img["name"]) - - # second version of delta dataset - create_delta_dataset() - - assert list( - DataChain.from_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") - ) == [ - "images/img4.jpg", - "images/img6.jpg", - "images/img8.jpg", - ] - - assert list( - DataChain.from_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") - ) == [ - "images/img10.jpg", - "images/img12.jpg", - "images/img14.jpg", - "images/img16.jpg", - "images/img18.jpg", - "images/img4.jpg", - "images/img6.jpg", - "images/img6.jpg", - "images/img8.jpg", - "images/img8.jpg", - ] - - # check that we have both old and new version of those that are modified - rows = list( - DataChain.from_dataset(ds_name, version=2) - .filter(C("index") == 6) - .order_by("file.path", "file.etag") - .collect("file") - ) - assert rows[0].etag == etags[6] - assert rows[1].etag > etags[6] # new etag is bigger as it's the value of mtime - - -def test_delta_update_no_diff(test_session, tmp_dir, tmp_path): - ds_name = "delta_ds" - path = tmp_dir.as_uri() - tmp_dir = tmp_dir / "images" - os.mkdir(tmp_dir) - - images = [ - {"name": f"img{i}.jpg", "data": Image.new(mode="RGB", size=(64, 128))} - for i in range(10) - ] - - for img in images: - img["data"].save(tmp_dir / img["name"]) - - def create_delta_dataset(): - def get_index(file: File) -> int: - r = r".+\/img(\d+)\.jpg" - return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] - - ( - DataChain.from_storage(path, update=True, session=test_session) - .filter(C("file.path").glob("*.jpg")) - .map(index=get_index) - .filter(C("index") > 5) - .save(ds_name, delta=True) - ) - - create_delta_dataset() - create_delta_dataset() - - assert ( - list( - DataChain.from_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") - ) - == list( - DataChain.from_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") - ) - == [ - "images/img6.jpg", - "images/img7.jpg", - "images/img8.jpg", - "images/img9.jpg", - ] - ) - - -def test_delta_update_no_file_signals(test_session): - starting_ds_name = "starting_ds" - - DataChain.from_values(num=[10, 20], session=test_session).save(starting_ds_name) - - with pytest.raises(ValueError) as excinfo: - 
DataChain.from_dataset( - starting_ds_name, - session=test_session, - ).save("delta_ds", delta=True) - - assert ( - str(excinfo.value) == "Datasets without file signals cannot have delta updates" - ) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py new file mode 100644 index 000000000..1b5f67784 --- /dev/null +++ b/tests/func/test_delta.py @@ -0,0 +1,226 @@ +import os + +import pytest +import regex as re +from PIL import Image + +from datachain import func +from datachain.lib.dc import C, DataChain +from datachain.lib.file import File, ImageFile + + +def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path): + starting_ds_name = "starting_ds" + ds_name = "delta_ds" + + images = [ + {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + ] + + def create_image_dataset(ds_name, images): + DataChain.from_values( + file=[ + ImageFile(path=img["name"], source=f"file://{tmp_path}") + for img in images + ], + session=test_session, + ).save(ds_name) + + def create_delta_dataset(ds_name): + DataChain.from_dataset( + starting_ds_name, + session=test_session, + ).save(ds_name, delta=True) + + # first version of starting dataset + create_image_dataset(starting_ds_name, images[:2]) + # first version of delta dataset + create_delta_dataset(ds_name) + # second version of starting dataset + create_image_dataset(starting_ds_name, images[2:]) + # second version of delta dataset + create_delta_dataset(ds_name) + + assert list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) == [ + "img1.jpg", + "img2.jpg", + ] + + assert list( + DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) == [ + "img1.jpg", + "img2.jpg", + "img3.jpg", + "img4.jpg", + ] + + +def test_delta_update_from_storage(test_session, tmp_dir, tmp_path): + ds_name = "delta_ds" + path = tmp_dir.as_uri() + tmp_dir = tmp_dir / "images" + os.mkdir(tmp_dir) + + images = [ + { + "name": f"img{i}.{'jpg' if i % 2 == 0 else 'png'}", + "data": Image.new(mode="RGB", size=((i + 1) * 10, (i + 1) * 10)), + } + for i in range(20) + ] + + # save only half of the images for now + for img in images[:10]: + img["data"].save(tmp_dir / img["name"]) + + def create_delta_dataset(): + def my_embedding(file: File) -> list[float]: + return [0.5, 0.5] + + def get_index(file: File) -> int: + r = r".+\/img(\d+)\.jpg" + return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] + + ( + DataChain.from_storage(path, update=True, session=test_session) + .filter(C("file.path").glob("*.jpg")) + .map(emb=my_embedding) + .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) + .map(index=get_index) + .filter(C("index") > 3) + .save(ds_name, delta=True) + ) + + # first version of delta dataset + create_delta_dataset() + + # remember old etags for later comparison to prove modified images are also taken + # into consideration on delta update + etags = { + r[0]: r[1].etag + for r in DataChain.from_dataset(ds_name, version=1).collect("index", "file") + } + + # remove last couple of images to simulate modification since we will re-create it + for img in images[5:10]: + os.remove(tmp_dir / img["name"]) + + # save other half of images and the ones that are removed above + for img in images[5:]: + img["data"].save(tmp_dir / img["name"]) 
+ + # second version of delta dataset + create_delta_dataset() + + assert list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) == [ + "images/img4.jpg", + "images/img6.jpg", + "images/img8.jpg", + ] + + assert list( + DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) == [ + "images/img10.jpg", + "images/img12.jpg", + "images/img14.jpg", + "images/img16.jpg", + "images/img18.jpg", + "images/img4.jpg", + "images/img6.jpg", + "images/img6.jpg", + "images/img8.jpg", + "images/img8.jpg", + ] + + # check that we have both old and new version of those that are modified + rows = list( + DataChain.from_dataset(ds_name, version=2) + .filter(C("index") == 6) + .order_by("file.path", "file.etag") + .collect("file") + ) + assert rows[0].etag == etags[6] + assert rows[1].etag > etags[6] # new etag is bigger as it's the value of mtime + + +def test_delta_update_no_diff(test_session, tmp_dir, tmp_path): + ds_name = "delta_ds" + path = tmp_dir.as_uri() + tmp_dir = tmp_dir / "images" + os.mkdir(tmp_dir) + + images = [ + {"name": f"img{i}.jpg", "data": Image.new(mode="RGB", size=(64, 128))} + for i in range(10) + ] + + for img in images: + img["data"].save(tmp_dir / img["name"]) + + def create_delta_dataset(): + def get_index(file: File) -> int: + r = r".+\/img(\d+)\.jpg" + return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] + + ( + DataChain.from_storage(path, update=True, session=test_session) + .filter(C("file.path").glob("*.jpg")) + .map(index=get_index) + .filter(C("index") > 5) + .save(ds_name, delta=True) + ) + + create_delta_dataset() + create_delta_dataset() + + assert ( + list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) + == list( + DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) + == [ + "images/img6.jpg", + "images/img7.jpg", + "images/img8.jpg", + "images/img9.jpg", + ] + ) + + +def test_delta_update_no_file_signals(test_session): + starting_ds_name = "starting_ds" + + DataChain.from_values(num=[10, 20], session=test_session).save(starting_ds_name) + + with pytest.raises(ValueError) as excinfo: + DataChain.from_dataset( + starting_ds_name, + session=test_session, + ).save("delta_ds", delta=True) + + assert ( + str(excinfo.value) == "Datasets without file signals cannot have delta updates" + ) From 58c27f038fe8f1588842cafa891067945dad6dec Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 16:39:04 +0100 Subject: [PATCH 12/45] updated docs --- src/datachain/lib/dc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index f5e0f0168..5b4065bab 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -766,7 +766,11 @@ def save( # type: ignore[override] name : dataset name. Empty name saves to a temporary dataset that will be removed after process ends. Temp dataset are useful for optimization. version : version of a dataset. Default - the last version that exist. - delta : whether this is an delta dataset or not. + delta : If True, we optimize on creation of the new dataset versions + by calculating diff between source and the last version and applying + all needed modifications (mappers, filters etc.) only on that diff. + At the end, we merge modified diff with last version of dataset to + create new version. 
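             Concretely, the behaviour encoded by the tests above: re-running the same
             chain with `delta=True` creates a new dataset version that keeps the
             previous records and adds only the ones built from new or changed source
             files (a sketch; `uri`, `session` and the dataset name are illustrative):

             ```py
             DataChain.from_storage(uri, update=True, session=session).save(
                 "delta_ds", delta=True
             )  # version 1

             # ... more files appear in storage, or existing ones are modified ...

             DataChain.from_storage(uri, update=True, session=session).save(
                 "delta_ds", delta=True
             )  # version 2

             # every path from version 1 is still present in version 2
             v1 = set(DataChain.from_dataset("delta_ds", version=1).collect("file.path"))
             v2 = set(DataChain.from_dataset("delta_ds", version=2).collect("file.path"))
             assert v1 <= v2
             ```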
""" schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: From 046731b7c354d571469e35306123015e44dc8563 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 5 Mar 2025 13:19:13 +0100 Subject: [PATCH 13/45] added sys columns explicitly --- src/datachain/diff/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datachain/diff/__init__.py b/src/datachain/diff/__init__.py index b325a2d29..0485ee6d5 100644 --- a/src/datachain/diff/__init__.py +++ b/src/datachain/diff/__init__.py @@ -137,6 +137,7 @@ def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]: for c in [c for c in cols if c in right_cols] } ) + .settings(sys=True) .select_except(ldiff_col, rdiff_col) ) From 9f52c8b132d8e358722f60a46a06583b98cc7fe0 Mon Sep 17 00:00:00 2001 From: ilongin Date: Thu, 6 Mar 2025 00:58:17 +0100 Subject: [PATCH 14/45] fixing delta to not have old versions in end result --- src/datachain/delta.py | 6 +++++- tests/func/test_delta.py | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 1b1406e3b..f3941f73e 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -36,4 +36,8 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: diff._query.steps += dc._query.steps # merging diff and the latest version of our dataset - return diff.union(DataChain.from_dataset(name, latest_version)) + return ( + DataChain.from_dataset(name, latest_version) + .diff(diff, added=True, modified=False) + .union(diff) + ) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 1b5f67784..96cd9792e 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -144,20 +144,20 @@ def get_index(file: File) -> int: "images/img18.jpg", "images/img4.jpg", "images/img6.jpg", - "images/img6.jpg", - "images/img8.jpg", "images/img8.jpg", ] - # check that we have both old and new version of those that are modified - rows = list( - DataChain.from_dataset(ds_name, version=2) - .filter(C("index") == 6) - .order_by("file.path", "file.etag") - .collect("file") + # check that we have newest versions for modified rows since etags are mtime + # and modified rows etags should be bigger than the old ones + assert ( + next( + DataChain.from_dataset(ds_name, version=2) + .filter(C("index") == 6) + .order_by("file.path", "file.etag") + .collect("file.etag") + ) + > etags[6] ) - assert rows[0].etag == etags[6] - assert rows[1].etag > etags[6] # new etag is bigger as it's the value of mtime def test_delta_update_no_diff(test_session, tmp_dir, tmp_path): From 802a934919b692027f6c6d4228166b3ef384ba26 Mon Sep 17 00:00:00 2001 From: ilongin Date: Thu, 6 Mar 2025 01:48:51 +0100 Subject: [PATCH 15/45] added append steps --- src/datachain/delta.py | 2 +- src/datachain/lib/dc.py | 9 +++++++++ src/datachain/lib/signal_schema.py | 7 +++++++ tests/unit/lib/test_datachain.py | 18 ++++++++++++++++++ tests/unit/lib/test_signal_schema.py | 6 ++++++ 5 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index f3941f73e..32ebdcbfb 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -33,7 +33,7 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # we append all the steps from the original chain to diff, # e.g filters, mappers, generators etc. 
With this we make sure we add all # needed modifications to diff part as well - diff._query.steps += dc._query.steps + diff = diff.append_steps(dc) # merging diff and the latest version of our dataset return ( diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 1aca56dca..840959a71 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -334,6 +334,15 @@ def clone(self) -> "Self": """Make a copy of the chain in a new table.""" return self._evolve(query=self._query.clone(new_table=True)) + def append_steps(self, chain: "DataChain") -> "Self": + """Returns cloned chain with appended steps from other chain. + Steps are all those modification methods applied like filters, mappers etc. + """ + dc = self.clone() + dc._query.steps += chain._query.steps + dc.signals_schema = dc.signals_schema.append(chain.signals_schema) + return dc + def _evolve( self, *, diff --git a/src/datachain/lib/signal_schema.py b/src/datachain/lib/signal_schema.py index e6654250c..c32bf10d7 100644 --- a/src/datachain/lib/signal_schema.py +++ b/src/datachain/lib/signal_schema.py @@ -646,6 +646,13 @@ def merge( return SignalSchema(self.values | schema_right) + def append(self, right: "SignalSchema") -> "SignalSchema": + missing_schema = { + key: right.values[key] + for key in [k for k in right.values if k not in self.values] + } + return SignalSchema(self.values | missing_schema) + def get_signals(self, target_type: type[DataModel]) -> Iterator[str]: for path, type_, has_subtree, _ in self.get_flat_tree(): if has_subtree and issubclass(type_, target_type): diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index d5b442edd..3a10e8616 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -2998,3 +2998,21 @@ def test_window_error(test_session): ), ): dc.mutate(first=func.sum("col2").over(window)) + + +def test_append_steps(test_session): + keys = ["a", "b", "c", "d"] + values = [1, 2, 3, 4] + + DataChain.from_values(key=keys, val=values, session=test_session).save("ds") + + ds1 = ( + DataChain.from_dataset("ds", session=test_session) + .filter(C("val") > 2) + .mutate(double=C("val") * 2) + ) + + ds2 = DataChain.from_dataset("ds", session=test_session).append_steps(ds1) + + assert list(ds2.order_by("val").collect("val")) == [3, 4] + assert list(ds2.order_by("val").collect("double")) == [6, 8] diff --git a/tests/unit/lib/test_signal_schema.py b/tests/unit/lib/test_signal_schema.py index 637194ae1..d03f52807 100644 --- a/tests/unit/lib/test_signal_schema.py +++ b/tests/unit/lib/test_signal_schema.py @@ -1137,3 +1137,9 @@ class Custom(DataModel): def test_get_file_signal(): assert SignalSchema({"name": str, "f": File}).get_file_signal() == "f" assert SignalSchema({"name": str}).get_file_signal() is None + + +def test_append(): + s1 = SignalSchema({"name": str, "f": File}) + s2 = SignalSchema({"name": str, "f": File, "age": int}) + assert s1.append(s2).values == {"name": str, "f": File, "age": int} From f3a7b128970e1bc55a0ecf7cd2cf4bcc36ee9ffd Mon Sep 17 00:00:00 2001 From: ilongin Date: Thu, 6 Mar 2025 13:14:39 +0100 Subject: [PATCH 16/45] fixing logic --- src/datachain/delta.py | 20 ++++++++++++-------- src/datachain/diff/__init__.py | 6 +++++- src/datachain/lib/dc.py | 8 +++++++- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 32ebdcbfb..b525252ca 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -27,17 +27,21 @@ def delta_update(dc: "DataChain", 
name: str) -> Optional["DataChain"]: source_ds_name = dc._query.starting_step.dataset_name source_ds_version = dc._query.starting_step.dataset_version - diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( - DataChain.from_dataset(name, version=latest_version), on=file_signal + + diff = ( + DataChain.from_dataset(source_ds_name, version=source_ds_version) + .diff( + DataChain.from_dataset(name, version=latest_version), + on=file_signal, + sys=True, + ) + # We append all the steps from the original chain to diff, e.g filters, mappers. + .append_steps(dc) ) - # we append all the steps from the original chain to diff, - # e.g filters, mappers, generators etc. With this we make sure we add all - # needed modifications to diff part as well - diff = diff.append_steps(dc) - # merging diff and the latest version of our dataset + # merging diff and the latest version of dataset return ( DataChain.from_dataset(name, latest_version) - .diff(diff, added=True, modified=False) + .diff(diff, added=True, modified=False, sys=True) .union(diff) ) diff --git a/src/datachain/diff/__init__.py b/src/datachain/diff/__init__.py index 0485ee6d5..d09931851 100644 --- a/src/datachain/diff/__init__.py +++ b/src/datachain/diff/__init__.py @@ -42,6 +42,7 @@ def _compare( # noqa: C901 modified: bool = True, same: bool = True, status_col: Optional[str] = None, + sys: Optional[bool] = False, ) -> "DataChain": """Comparing two chains by identifying rows that are added, deleted, modified or same""" @@ -137,10 +138,13 @@ def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]: for c in [c for c in cols if c in right_cols] } ) - .settings(sys=True) .select_except(ldiff_col, rdiff_col) ) + if sys: + # making sure we have sys signals in final diff chain + dc_diff = dc_diff.settings(sys=True) + if not added: dc_diff = dc_diff.filter(C(diff_col) != CompareStatus.ADDED) if not modified: diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 840959a71..571c6014c 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -339,7 +339,7 @@ def append_steps(self, chain: "DataChain") -> "Self": Steps are all those modification methods applied like filters, mappers etc. """ dc = self.clone() - dc._query.steps += chain._query.steps + dc._query.steps += chain._query.steps.copy() dc.signals_schema = dc.signals_schema.append(chain.signals_schema) return dc @@ -1648,6 +1648,7 @@ def compare( modified: bool = True, same: bool = False, status_col: Optional[str] = None, + sys: Optional[bool] = False, ) -> "DataChain": """Comparing two chains by identifying rows that are added, deleted, modified or same. Result is the new chain that has additional column with possible @@ -1680,6 +1681,7 @@ def compare( same (bool): Whether to return unchanged rows in resulting chain. status_col (str): Name of the new column that is created in resulting chain representing diff status. + sys (bool): Whether to have sys columns in returned diff chain or not. Example: ```py @@ -1710,6 +1712,7 @@ def compare( modified=modified, same=same, status_col=status_col, + sys=sys, ) def diff( @@ -1722,6 +1725,7 @@ def diff( deleted: bool = False, same: bool = False, status_col: Optional[str] = None, + sys: Optional[bool] = False, ) -> "DataChain": """Similar to `.compare()`, which is more generic method to calculate difference between two chains. Unlike `.compare()`, this method works only on those chains @@ -1744,6 +1748,7 @@ def diff( same (bool): Whether to return unchanged rows in resulting chain. 
status_col (str): Optional name of the new column that is created in resulting chain representing diff status. + sys (bool): Whether to have sys columns in returned diff chain or not. Example: ```py @@ -1783,6 +1788,7 @@ def get_file_signals(file: str, signals): modified=modified, same=same, status_col=status_col, + sys=sys, ) @classmethod From d7b86233aac1b4c51561c6c45fc3122fcb1ce36c Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 11 Mar 2025 10:48:34 +0100 Subject: [PATCH 17/45] removed append steps from DataChain --- src/datachain/delta.py | 26 +++++++++++++++++--------- src/datachain/lib/dc.py | 9 --------- tests/unit/lib/test_datachain.py | 18 ------------------ 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index b525252ca..c570aa4d1 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -6,6 +6,16 @@ from datachain.lib.dc import DataChain +def _append_steps(dc: "DataChain", other: "DataChain"): + """Returns cloned chain with appended steps from other chain. + Steps are all those modification methods applied like filters, mappers etc. + """ + dc = dc.clone() + dc._query.steps += other._query.steps.copy() + dc.signals_schema = dc.signals_schema.append(other.signals_schema) + return dc + + def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: """ Creates new chain that consists of the last version of current delta dataset @@ -28,17 +38,15 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: source_ds_name = dc._query.starting_step.dataset_name source_ds_version = dc._query.starting_step.dataset_version - diff = ( - DataChain.from_dataset(source_ds_name, version=source_ds_version) - .diff( - DataChain.from_dataset(name, version=latest_version), - on=file_signal, - sys=True, - ) - # We append all the steps from the original chain to diff, e.g filters, mappers. - .append_steps(dc) + diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( + DataChain.from_dataset(name, version=latest_version), + on=file_signal, + sys=True, ) + # We append all the steps from the original chain to diff, e.g filters, mappers. + diff = _append_steps(diff, dc) + # merging diff and the latest version of dataset return ( DataChain.from_dataset(name, latest_version) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 53a4f0625..fac14fc9e 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -334,15 +334,6 @@ def clone(self) -> "Self": """Make a copy of the chain in a new table.""" return self._evolve(query=self._query.clone(new_table=True)) - def append_steps(self, chain: "DataChain") -> "Self": - """Returns cloned chain with appended steps from other chain. - Steps are all those modification methods applied like filters, mappers etc. 
- """ - dc = self.clone() - dc._query.steps += chain._query.steps.copy() - dc.signals_schema = dc.signals_schema.append(chain.signals_schema) - return dc - def _evolve( self, *, diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index 3a10e8616..d5b442edd 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -2998,21 +2998,3 @@ def test_window_error(test_session): ), ): dc.mutate(first=func.sum("col2").over(window)) - - -def test_append_steps(test_session): - keys = ["a", "b", "c", "d"] - values = [1, 2, 3, 4] - - DataChain.from_values(key=keys, val=values, session=test_session).save("ds") - - ds1 = ( - DataChain.from_dataset("ds", session=test_session) - .filter(C("val") > 2) - .mutate(double=C("val") * 2) - ) - - ds2 = DataChain.from_dataset("ds", session=test_session).append_steps(ds1) - - assert list(ds2.order_by("val").collect("val")) == [3, 4] - assert list(ds2.order_by("val").collect("double")) == [6, 8] From 0464c165c52418028471ed08d256c1d9be6ea8af Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 11 Mar 2025 13:02:52 +0100 Subject: [PATCH 18/45] added better docs --- src/datachain/lib/dc.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index fac14fc9e..aec6a6343 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -774,10 +774,19 @@ def save( # type: ignore[override] removed after process ends. Temp dataset are useful for optimization. version : version of a dataset. Default - the last version that exist. delta : If True, we optimize on creation of the new dataset versions - by calculating diff between source and the last version and applying - all needed modifications (mappers, filters etc.) only on that diff. - At the end, we merge modified diff with last version of dataset to - create new version. + by calculating diff between source and the last version of dataset + and applying all needed modifications (mappers, filters etc.) only + on that diff. + Then we merge modified diff with the last version of dataset to + create new version. This way we avoid applying modifications to all + records from source every time since that can be expensive operation. + Source can be cloud storage or other dataset which has File object + in schema. + Diff is calculated using `DataChain.diff()` method which looks into + File `source` and `path` for matching, and File `version` and `etag` + for checking if the record is changed. + Note that this takes in account only added and changed records in + source while deleted recordsare not removed in the new dataset version. 
""" schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: From 8093000db38627e5b68390a6b0f0e62d8324204e Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 12 Mar 2025 16:26:10 +0100 Subject: [PATCH 19/45] removed sys flag --- src/datachain/delta.py | 3 +-- src/datachain/diff/__init__.py | 13 ++++++------- src/datachain/lib/dc.py | 6 ------ 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index c570aa4d1..e15f79b0b 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -41,7 +41,6 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( DataChain.from_dataset(name, version=latest_version), on=file_signal, - sys=True, ) # We append all the steps from the original chain to diff, e.g filters, mappers. @@ -50,6 +49,6 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # merging diff and the latest version of dataset return ( DataChain.from_dataset(name, latest_version) - .diff(diff, added=True, modified=False, sys=True) + .diff(diff, added=True, modified=False) .union(diff) ) diff --git a/src/datachain/diff/__init__.py b/src/datachain/diff/__init__.py index d09931851..511bc044b 100644 --- a/src/datachain/diff/__init__.py +++ b/src/datachain/diff/__init__.py @@ -30,7 +30,7 @@ class CompareStatus(str, Enum): SAME = "S" -def _compare( # noqa: C901 +def _compare( # noqa: C901, PLR0912 left: "DataChain", right: "DataChain", on: Union[str, Sequence[str]], @@ -42,7 +42,6 @@ def _compare( # noqa: C901 modified: bool = True, same: bool = True, status_col: Optional[str] = None, - sys: Optional[bool] = False, ) -> "DataChain": """Comparing two chains by identifying rows that are added, deleted, modified or same""" @@ -141,10 +140,6 @@ def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]: .select_except(ldiff_col, rdiff_col) ) - if sys: - # making sure we have sys signals in final diff chain - dc_diff = dc_diff.settings(sys=True) - if not added: dc_diff = dc_diff.filter(C(diff_col) != CompareStatus.ADDED) if not modified: @@ -157,7 +152,11 @@ def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]: if status_col: cols.append(diff_col) # type: ignore[arg-type] - dc_diff = dc_diff.select(*cols) + if not dc_diff._sys: + # TODO workaround when sys signal is not available in diff + dc_diff = dc_diff.settings(sys=True).select(*cols).settings(sys=False) + else: + dc_diff = dc_diff.select(*cols) # final schema is schema from the left chain with status column added if needed dc_diff.signals_schema = ( diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index aec6a6343..bb9a0f683 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -1648,7 +1648,6 @@ def compare( modified: bool = True, same: bool = False, status_col: Optional[str] = None, - sys: Optional[bool] = False, ) -> "DataChain": """Comparing two chains by identifying rows that are added, deleted, modified or same. Result is the new chain that has additional column with possible @@ -1681,7 +1680,6 @@ def compare( same (bool): Whether to return unchanged rows in resulting chain. status_col (str): Name of the new column that is created in resulting chain representing diff status. - sys (bool): Whether to have sys columns in returned diff chain or not. 
Example: ```py @@ -1712,7 +1710,6 @@ def compare( modified=modified, same=same, status_col=status_col, - sys=sys, ) def diff( @@ -1725,7 +1722,6 @@ def diff( deleted: bool = False, same: bool = False, status_col: Optional[str] = None, - sys: Optional[bool] = False, ) -> "DataChain": """Similar to `.compare()`, which is more generic method to calculate difference between two chains. Unlike `.compare()`, this method works only on those chains @@ -1748,7 +1744,6 @@ def diff( same (bool): Whether to return unchanged rows in resulting chain. status_col (str): Optional name of the new column that is created in resulting chain representing diff status. - sys (bool): Whether to have sys columns in returned diff chain or not. Example: ```py @@ -1788,7 +1783,6 @@ def get_file_signals(file: str, signals): modified=modified, same=same, status_col=status_col, - sys=sys, ) @classmethod From 0dd71a2d9433b7b79344ebd60cb702fd402b9356 Mon Sep 17 00:00:00 2001 From: ilongin Date: Fri, 14 Mar 2025 08:56:21 +0100 Subject: [PATCH 20/45] fixing typo --- src/datachain/lib/dc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 6e4821b7b..2c8fbd873 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -799,7 +799,7 @@ def save( # type: ignore[override] File `source` and `path` for matching, and File `version` and `etag` for checking if the record is changed. Note that this takes in account only added and changed records in - source while deleted recordsare not removed in the new dataset version. + source while deleted records are not removed in the new dataset version. """ schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: From 2b29498504fd12b958df9f6bc90df7b8fb6390df Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 24 Mar 2025 16:08:21 +0100 Subject: [PATCH 21/45] added alternative delta approach --- src/datachain/dataset.py | 17 +++++++--------- src/datachain/delta.py | 42 ++++++++++++++++++++++++++++++++++++++++ src/datachain/lib/dc.py | 4 ++-- 3 files changed, 51 insertions(+), 12 deletions(-) diff --git a/src/datachain/dataset.py b/src/datachain/dataset.py index 2fd718686..59602987a 100644 --- a/src/datachain/dataset.py +++ b/src/datachain/dataset.py @@ -105,24 +105,21 @@ def parse( dataset_version: Optional[int], dataset_version_created_at: Optional[datetime], ) -> Optional["DatasetDependency"]: - from datachain.client import Client - from datachain.lib.listing import is_listing_dataset, listing_uri_from_name + from datachain.lib.listing import is_listing_dataset if not dataset_id: return None assert dataset_name is not None - dependency_type = DatasetDependencyType.DATASET - dependency_name = dataset_name - - if is_listing_dataset(dataset_name): - dependency_type = DatasetDependencyType.STORAGE # type: ignore[arg-type] - dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name)) return cls( id, - dependency_type, - dependency_name, + ( + DatasetDependencyType.STORAGE + if is_listing_dataset(dataset_name) + else DatasetDependencyType.DATASET + ), + dataset_name, ( str(dataset_version) # type: ignore[arg-type] if dataset_version diff --git a/src/datachain/delta.py b/src/datachain/delta.py index e15f79b0b..4b39defc3 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -52,3 +52,45 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: .diff(diff, added=True, modified=False) .union(diff) ) + + +def delta_update_alternative(dc: 
"DataChain", name: str) -> Optional["DataChain"]: + from datachain.lib.dc import DataChain + + catalog = dc.session.catalog + try: + latest_version = catalog.get_dataset(name).latest_version + except DatasetNotFoundError: + # first creation of delta update dataset + return None + + dependencies = catalog.get_dataset_dependencies(name, latest_version) + if len(dependencies) > 1: + raise Exception("Cannot do delta with dataset that has multiple dependencies") + + dep = dependencies[0] + if not dep: + # starting dataset (e.g listing) was removed so we are backing off to normal + # dataset creation, as it was created first time + return None + + source_ds_name = dep.name + source_ds_version = int(dep.version) + source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version + + source_dc = DataChain.from_dataset(source_ds_name, source_ds_version) + source_dc_latest = DataChain.from_dataset(source_ds_name, source_ds_latest_version) + file_signal = source_dc.signals_schema.get_file_signal() + if not file_signal: + raise ValueError("Datasets without file signals cannot have delta updates") + + diff = source_dc_latest.diff(source_dc, on=file_signal) + # We append all the steps from the original chain to diff, e.g filters, mappers. + diff = _append_steps(diff, dc) + + # merging diff and the latest version of dataset + return ( + DataChain.from_dataset(name, latest_version) + .diff(diff, added=True, modified=False) + .union(diff) + ) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 1b182b236..20a34e539 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -26,7 +26,7 @@ from tqdm import tqdm from datachain.dataset import DatasetRecord -from datachain.delta import delta_update +from datachain.delta import delta_update_alternative from datachain.func import literal from datachain.func.base import Function from datachain.func.func import Func @@ -821,7 +821,7 @@ def save( # type: ignore[override] """ schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: - delta_ds = delta_update(self, name) + delta_ds = delta_update_alternative(self, name) if delta_ds: return self._evolve( query=delta_ds._query.save( From e085280366b9da9b60a42273de3d664be31bb739 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 15 Apr 2025 03:38:39 +0200 Subject: [PATCH 22/45] fixing delta due to lazy listing changes --- src/datachain/delta.py | 18 +++++++------- src/datachain/lib/dc/datachain.py | 4 ++-- src/datachain/query/dataset.py | 25 +++++++++++++------- tests/func/test_delta.py | 39 +++++++++++++------------------ 4 files changed, 44 insertions(+), 42 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 4b39defc3..ab41b91e0 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -1,5 +1,6 @@ from typing import TYPE_CHECKING, Optional +import datachain from datachain.error import DatasetNotFoundError if TYPE_CHECKING: @@ -24,7 +25,7 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: apply all the DataChain methods like filters, mappers, generators etc.) but just the diff part which is very important for performance. 
""" - from datachain.lib.dc import DataChain + dc._query.apply_listing_pre_step() file_signal = dc.signals_schema.get_file_signal() if not file_signal: @@ -35,11 +36,12 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # first creation of delta update dataset return None + assert dc._query.starting_step source_ds_name = dc._query.starting_step.dataset_name source_ds_version = dc._query.starting_step.dataset_version - diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( - DataChain.from_dataset(name, version=latest_version), + diff = datachain.read_dataset(source_ds_name, version=source_ds_version).diff( + datachain.read_dataset(name, version=latest_version), on=file_signal, ) @@ -48,15 +50,13 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # merging diff and the latest version of dataset return ( - DataChain.from_dataset(name, latest_version) + datachain.read_dataset(name, latest_version) .diff(diff, added=True, modified=False) .union(diff) ) def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain"]: - from datachain.lib.dc import DataChain - catalog = dc.session.catalog try: latest_version = catalog.get_dataset(name).latest_version @@ -78,8 +78,8 @@ def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain" source_ds_version = int(dep.version) source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version - source_dc = DataChain.from_dataset(source_ds_name, source_ds_version) - source_dc_latest = DataChain.from_dataset(source_ds_name, source_ds_latest_version) + source_dc = datachain.read_dataset(source_ds_name, source_ds_version) + source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version) file_signal = source_dc.signals_schema.get_file_signal() if not file_signal: raise ValueError("Datasets without file signals cannot have delta updates") @@ -90,7 +90,7 @@ def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain" # merging diff and the latest version of dataset return ( - DataChain.from_dataset(name, latest_version) + datachain.read_dataset(name, latest_version) .diff(diff, added=True, modified=False) .union(diff) ) diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 205c53fff..a09832c8f 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -24,7 +24,7 @@ from tqdm import tqdm from datachain.dataset import DatasetRecord -from datachain.delta import delta_update_alternative +from datachain.delta import delta_update from datachain.func import literal from datachain.func.base import Function from datachain.func.func import Func @@ -488,7 +488,7 @@ def save( # type: ignore[override] """ schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: - delta_ds = delta_update_alternative(self, name) + delta_ds = delta_update(self, name) if delta_ds: return self._evolve( query=delta_ds._query.save( diff --git a/src/datachain/query/dataset.py b/src/datachain/query/dataset.py index 49297e769..ea881f13e 100644 --- a/src/datachain/query/dataset.py +++ b/src/datachain/query/dataset.py @@ -1113,9 +1113,14 @@ def __init__( self.version = version if is_listing_dataset(name): - # not setting query step yet as listing dataset might not exist at - # this point - self.list_ds_name = name + if version: + # this listing dataset should already be listed as we specify + # exact version + 
self._set_starting_step(self.catalog.get_dataset(name)) + else: + # not setting query step yet as listing dataset might not exist at + # this point + self.list_ds_name = name elif fallback_to_studio and is_token_set(): self._set_starting_step( self.catalog.get_dataset_with_remote_fallback(name, version) @@ -1201,11 +1206,8 @@ def set_listing_fn(self, fn: Callable) -> None: """Setting listing function to be run if needed""" self.listing_fn = fn - def apply_steps(self) -> QueryGenerator: - """ - Apply the steps in the query and return the resulting - sqlalchemy.SelectBase. - """ + def apply_listing_pre_step(self) -> None: + """Runs listing pre-step if needed""" if self.list_ds_name and not self.starting_step: listing_ds = None try: @@ -1221,6 +1223,13 @@ def apply_steps(self) -> QueryGenerator: # at this point we know what is our starting listing dataset name self._set_starting_step(listing_ds) # type: ignore [arg-type] + def apply_steps(self) -> QueryGenerator: + """ + Apply the steps in the query and return the resulting + sqlalchemy.SelectBase. + """ + self.apply_listing_pre_step() + query = self.clone() index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 96cd9792e..5c4988f3a 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -4,8 +4,9 @@ import regex as re from PIL import Image +import datachain as dc from datachain import func -from datachain.lib.dc import C, DataChain +from datachain.lib.dc import C from datachain.lib.file import File, ImageFile @@ -21,7 +22,7 @@ def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path): ] def create_image_dataset(ds_name, images): - DataChain.from_values( + dc.read_values( file=[ ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images @@ -30,7 +31,7 @@ def create_image_dataset(ds_name, images): ).save(ds_name) def create_delta_dataset(ds_name): - DataChain.from_dataset( + dc.read_dataset( starting_ds_name, session=test_session, ).save(ds_name, delta=True) @@ -45,18 +46,14 @@ def create_delta_dataset(ds_name): create_delta_dataset(ds_name) assert list( - DataChain.from_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") + dc.read_dataset(ds_name, version=1).order_by("file.path").collect("file.path") ) == [ "img1.jpg", "img2.jpg", ] assert list( - DataChain.from_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") + dc.read_dataset(ds_name, version=2).order_by("file.path").collect("file.path") ) == [ "img1.jpg", "img2.jpg", @@ -92,7 +89,7 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - DataChain.from_storage(path, update=True, session=test_session) + dc.read_storage(path, update=True, session=test_session) .filter(C("file.path").glob("*.jpg")) .map(emb=my_embedding) .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) @@ -108,7 +105,7 @@ def get_index(file: File) -> int: # into consideration on delta update etags = { r[0]: r[1].etag - for r in DataChain.from_dataset(ds_name, version=1).collect("index", "file") + for r in dc.read_dataset(ds_name, version=1).collect("index", "file") } # remove last couple of images to simulate modification since we will re-create it @@ -123,9 +120,7 @@ def get_index(file: File) -> int: create_delta_dataset() assert list( - DataChain.from_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") + dc.read_dataset(ds_name, 
version=1).order_by("file.path").collect("file.path") ) == [ "images/img4.jpg", "images/img6.jpg", @@ -133,9 +128,7 @@ def get_index(file: File) -> int: ] assert list( - DataChain.from_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") + dc.read_dataset(ds_name, version=2).order_by("file.path").collect("file.path") ) == [ "images/img10.jpg", "images/img12.jpg", @@ -151,7 +144,7 @@ def get_index(file: File) -> int: # and modified rows etags should be bigger than the old ones assert ( next( - DataChain.from_dataset(ds_name, version=2) + dc.read_dataset(ds_name, version=2) .filter(C("index") == 6) .order_by("file.path", "file.etag") .collect("file.etag") @@ -180,7 +173,7 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - DataChain.from_storage(path, update=True, session=test_session) + dc.read_storage(path, update=True, session=test_session) .filter(C("file.path").glob("*.jpg")) .map(index=get_index) .filter(C("index") > 5) @@ -192,12 +185,12 @@ def get_index(file: File) -> int: assert ( list( - DataChain.from_dataset(ds_name, version=1) + dc.read_dataset(ds_name, version=1) .order_by("file.path") .collect("file.path") ) == list( - DataChain.from_dataset(ds_name, version=2) + dc.read_dataset(ds_name, version=2) .order_by("file.path") .collect("file.path") ) @@ -213,10 +206,10 @@ def get_index(file: File) -> int: def test_delta_update_no_file_signals(test_session): starting_ds_name = "starting_ds" - DataChain.from_values(num=[10, 20], session=test_session).save(starting_ds_name) + dc.read_values(num=[10, 20], session=test_session).save(starting_ds_name) with pytest.raises(ValueError) as excinfo: - DataChain.from_dataset( + dc.read_dataset( starting_ds_name, session=test_session, ).save("delta_ds", delta=True) From 735af026a7598fd097d7427fd92ef1424af8b6ce Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 15 Apr 2025 10:41:20 +0200 Subject: [PATCH 23/45] fixing datasetdependencies --- src/datachain/dataset.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/datachain/dataset.py b/src/datachain/dataset.py index 59602987a..2fd718686 100644 --- a/src/datachain/dataset.py +++ b/src/datachain/dataset.py @@ -105,21 +105,24 @@ def parse( dataset_version: Optional[int], dataset_version_created_at: Optional[datetime], ) -> Optional["DatasetDependency"]: - from datachain.lib.listing import is_listing_dataset + from datachain.client import Client + from datachain.lib.listing import is_listing_dataset, listing_uri_from_name if not dataset_id: return None assert dataset_name is not None + dependency_type = DatasetDependencyType.DATASET + dependency_name = dataset_name + + if is_listing_dataset(dataset_name): + dependency_type = DatasetDependencyType.STORAGE # type: ignore[arg-type] + dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name)) return cls( id, - ( - DatasetDependencyType.STORAGE - if is_listing_dataset(dataset_name) - else DatasetDependencyType.DATASET - ), - dataset_name, + dependency_type, + dependency_name, ( str(dataset_version) # type: ignore[arg-type] if dataset_version From f3ebf97397f4c9288b0799cd3c69008d99cfef26 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 15 Apr 2025 11:47:55 +0200 Subject: [PATCH 24/45] returning function --- src/datachain/lib/signal_schema.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/datachain/lib/signal_schema.py b/src/datachain/lib/signal_schema.py index b011949cf..fd9a17e1a 
100644 --- a/src/datachain/lib/signal_schema.py +++ b/src/datachain/lib/signal_schema.py @@ -461,13 +461,14 @@ def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]: pos += 1 return objs - def get_file_signal(self) -> Optional[str]: - for signal_name, signal_type in self.values.items(): - if (fr := ModelStore.to_pydantic(signal_type)) is not None and issubclass( + def contains_file(self) -> bool: + for type_ in self.values.values(): + if (fr := ModelStore.to_pydantic(type_)) is not None and issubclass( fr, File ): - return signal_name - return None + return True + + return False def slice( self, From 59b7666b0119042554104e3767b230ceb6f1c4e7 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 15 Apr 2025 16:09:06 +0200 Subject: [PATCH 25/45] renaming method --- src/datachain/lib/signal_schema.py | 11 +++++------ tests/unit/lib/test_signal_schema.py | 11 ----------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/src/datachain/lib/signal_schema.py b/src/datachain/lib/signal_schema.py index fd9a17e1a..b011949cf 100644 --- a/src/datachain/lib/signal_schema.py +++ b/src/datachain/lib/signal_schema.py @@ -461,14 +461,13 @@ def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]: pos += 1 return objs - def contains_file(self) -> bool: - for type_ in self.values.values(): - if (fr := ModelStore.to_pydantic(type_)) is not None and issubclass( + def get_file_signal(self) -> Optional[str]: + for signal_name, signal_type in self.values.items(): + if (fr := ModelStore.to_pydantic(signal_type)) is not None and issubclass( fr, File ): - return True - - return False + return signal_name + return None def slice( self, diff --git a/tests/unit/lib/test_signal_schema.py b/tests/unit/lib/test_signal_schema.py index 26ddc7b5d..2eb7fb769 100644 --- a/tests/unit/lib/test_signal_schema.py +++ b/tests/unit/lib/test_signal_schema.py @@ -1041,17 +1041,6 @@ def test_get_flatten_hidden_fields(schema, hidden_fields): assert SignalSchema.get_flatten_hidden_fields(schema_serialized) == hidden_fields -@pytest.mark.parametrize( - "schema,result", - [ - ({"name": str, "value": int}, False), - ({"name": str, "age": float, "f": File}, True), - ], -) -def test_contains_file(schema, result): - assert SignalSchema(schema).contains_file() is result - - def test_slice(): schema = {"name": str, "age": float, "address": str} setup_values = {"init": lambda: 37} From 7d0a28308b879bd580859051307dc59fab95cd63 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 22 Apr 2025 16:21:42 +0200 Subject: [PATCH 26/45] leaving only alternative implementation --- src/datachain/dataset.py | 17 +++++------- src/datachain/delta.py | 56 ++++++++++++++-------------------------- tests/func/test_delta.py | 3 ++- 3 files changed, 29 insertions(+), 47 deletions(-) diff --git a/src/datachain/dataset.py b/src/datachain/dataset.py index 38d53fdcc..8934d34a6 100644 --- a/src/datachain/dataset.py +++ b/src/datachain/dataset.py @@ -105,24 +105,21 @@ def parse( dataset_version: Optional[int], dataset_version_created_at: Optional[datetime], ) -> Optional["DatasetDependency"]: - from datachain.client import Client - from datachain.lib.listing import is_listing_dataset, listing_uri_from_name + from datachain.lib.listing import is_listing_dataset if not dataset_id: return None assert dataset_name is not None - dependency_type = DatasetDependencyType.DATASET - dependency_name = dataset_name - - if is_listing_dataset(dataset_name): - dependency_type = DatasetDependencyType.STORAGE # type: ignore[arg-type] - dependency_name, _ = 
Client.parse_url(listing_uri_from_name(dataset_name)) return cls( id, - dependency_type, - dependency_name, + ( + DatasetDependencyType.STORAGE + if is_listing_dataset(dataset_name) + else DatasetDependencyType.DATASET + ), + dataset_name, ( str(dataset_version) # type: ignore[arg-type] if dataset_version diff --git a/src/datachain/delta.py b/src/datachain/delta.py index ab41b91e0..f375e4501 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -24,40 +24,16 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: This way we don't need to re-calculate the whole chain from the source again( apply all the DataChain methods like filters, mappers, generators etc.) but just the diff part which is very important for performance. + + Note that currently delta update works only if there is only one direct dependency. """ + catalog = dc.session.catalog dc._query.apply_listing_pre_step() - file_signal = dc.signals_schema.get_file_signal() - if not file_signal: - raise ValueError("Datasets without file signals cannot have delta updates") - try: - latest_version = dc.session.catalog.get_dataset(name).latest_version - except DatasetNotFoundError: - # first creation of delta update dataset - return None - - assert dc._query.starting_step - source_ds_name = dc._query.starting_step.dataset_name - source_ds_version = dc._query.starting_step.dataset_version - - diff = datachain.read_dataset(source_ds_name, version=source_ds_version).diff( - datachain.read_dataset(name, version=latest_version), - on=file_signal, - ) - - # We append all the steps from the original chain to diff, e.g filters, mappers. - diff = _append_steps(diff, dc) - - # merging diff and the latest version of dataset - return ( - datachain.read_dataset(name, latest_version) - .diff(diff, added=True, modified=False) - .union(diff) - ) + chain_file_signal = dc.signals_schema.get_file_signal() + if not chain_file_signal: + raise ValueError("Chain doesn't produce file signal, cannot do delta update") - -def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain"]: - catalog = dc.session.catalog try: latest_version = catalog.get_dataset(name).latest_version except DatasetNotFoundError: @@ -66,7 +42,9 @@ def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain" dependencies = catalog.get_dataset_dependencies(name, latest_version) if len(dependencies) > 1: - raise Exception("Cannot do delta with dataset that has multiple dependencies") + raise Exception( + "Cannot do delta with dataset that has multiple direct dependencies" + ) dep = dependencies[0] if not dep: @@ -80,17 +58,23 @@ def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain" source_dc = datachain.read_dataset(source_ds_name, source_ds_version) source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version) - file_signal = source_dc.signals_schema.get_file_signal() - if not file_signal: - raise ValueError("Datasets without file signals cannot have delta updates") + source_file_signal = source_dc.signals_schema.get_file_signal() + if not source_file_signal: + raise ValueError("Source dataset doesn't have file signals") - diff = source_dc_latest.diff(source_dc, on=file_signal) + diff = source_dc_latest.diff(source_dc, on=source_file_signal) # We append all the steps from the original chain to diff, e.g filters, mappers. 
diff = _append_steps(diff, dc) # merging diff and the latest version of dataset return ( datachain.read_dataset(name, latest_version) - .diff(diff, added=True, modified=False) + .diff( + diff, + on=chain_file_signal, + right_on=source_file_signal, + added=True, + modified=False, + ) .union(diff) ) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 5c4988f3a..74d7f78d0 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -215,5 +215,6 @@ def test_delta_update_no_file_signals(test_session): ).save("delta_ds", delta=True) assert ( - str(excinfo.value) == "Datasets without file signals cannot have delta updates" + str(excinfo.value) + == "Chain doesn't produce file signal, cannot do delta update" ) From 95a206bf787dd9ac282dee8e3b923034138b326a Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 23 Apr 2025 01:52:11 +0200 Subject: [PATCH 27/45] fixing tests --- tests/func/test_dataset_query.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/func/test_dataset_query.py b/tests/func/test_dataset_query.py index 093b7d797..7af2ddc26 100644 --- a/tests/func/test_dataset_query.py +++ b/tests/func/test_dataset_query.py @@ -10,6 +10,7 @@ from datachain.error import ( DatasetVersionNotFoundError, ) +from datachain.lib.listing import parse_listing_uri from datachain.query import C, DatasetQuery, Object, Stream from datachain.sql.functions import path as pathfunc from datachain.sql.types import String @@ -956,6 +957,9 @@ def test_dataset_dependencies_one_storage_as_dependency( ds_name = uuid.uuid4().hex catalog = cloud_test_catalog.catalog listing = catalog.listings()[0] + dep_name, _, _ = parse_listing_uri( + cloud_test_catalog.src_uri, catalog.client_config + ) DatasetQuery(cats_dataset.name, catalog=catalog).save(ds_name) @@ -968,7 +972,7 @@ def test_dataset_dependencies_one_storage_as_dependency( { "id": ANY, "type": DatasetDependencyType.STORAGE, - "name": cloud_test_catalog.src_uri, + "name": dep_name, "version": str(1), "created_at": listing.created_at, "dependencies": [], @@ -984,6 +988,10 @@ def test_dataset_dependencies_one_registered_dataset_as_dependency( catalog = cloud_test_catalog.catalog listing = catalog.listings()[0] + dep_name, _, _ = parse_listing_uri( + cloud_test_catalog.src_uri, catalog.client_config + ) + DatasetQuery(name=dogs_dataset.name, catalog=catalog).save(ds_name) expected = [ @@ -1002,7 +1010,7 @@ def test_dataset_dependencies_one_registered_dataset_as_dependency( { "id": ANY, "type": DatasetDependencyType.STORAGE, - "name": cloud_test_catalog.src_uri, + "name": dep_name, "version": str(1), "created_at": listing.created_at, "dependencies": [], @@ -1028,6 +1036,9 @@ def test_dataset_dependencies_multiple_direct_dataset_dependencies( ds_name = uuid.uuid4().hex catalog = cloud_test_catalog.catalog listing = catalog.listings()[0] + dep_name, _, _ = parse_listing_uri( + cloud_test_catalog.src_uri, catalog.client_config + ) dogs = DatasetQuery(name=dogs_dataset.name, version=1, catalog=catalog) cats = DatasetQuery(name=cats_dataset.name, version=1, catalog=catalog) @@ -1040,7 +1051,7 @@ def test_dataset_dependencies_multiple_direct_dataset_dependencies( storage_depenedncy = { "id": ANY, "type": DatasetDependencyType.STORAGE, - "name": cloud_test_catalog.src_uri, + "name": dep_name, "version": str(1), "created_at": listing.created_at, "dependencies": [], @@ -1097,6 +1108,9 @@ def test_dataset_dependencies_multiple_union( ds_name = uuid.uuid4().hex catalog = cloud_test_catalog.catalog listing = 
catalog.listings()[0] + dep_name, _, _ = parse_listing_uri( + cloud_test_catalog.src_uri, catalog.client_config + ) dogs = DatasetQuery(name=dogs_dataset.name, version=1, catalog=catalog) cats = DatasetQuery(name=cats_dataset.name, version=1, catalog=catalog) @@ -1107,7 +1121,7 @@ def test_dataset_dependencies_multiple_union( storage_depenedncy = { "id": ANY, "type": DatasetDependencyType.STORAGE, - "name": cloud_test_catalog.src_uri, + "name": dep_name, "version": str(1), "created_at": listing.created_at, "dependencies": [], From de7232983b8c8e782588bf700e7dc8c8e6087fe5 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 23 Apr 2025 02:02:35 +0200 Subject: [PATCH 28/45] fixing tests --- tests/func/test_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/func/test_datasets.py b/tests/func/test_datasets.py index f47eb8d2b..d55d09423 100644 --- a/tests/func/test_datasets.py +++ b/tests/func/test_datasets.py @@ -854,6 +854,7 @@ def test_dataset_storage_dependencies(cloud_test_catalog, cloud_type, indirect): session = ctc.session catalog = session.catalog uri = cloud_test_catalog.src_uri + dep_name, _, _ = parse_listing_uri(ctc.src_uri, catalog.client_config) ds_name = "some_ds" dc.read_storage(uri, session=session).save(ds_name) @@ -868,7 +869,7 @@ def test_dataset_storage_dependencies(cloud_test_catalog, cloud_type, indirect): { "id": ANY, "type": DatasetDependencyType.STORAGE, - "name": uri, + "name": dep_name, "version": "1", "created_at": lst_dataset.get_version(1).created_at, "dependencies": [], From 55269ab250b099050fa159aa84dfccf7a0a54d43 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 23 Apr 2025 10:55:45 +0200 Subject: [PATCH 29/45] fixing tests --- tests/func/test_datachain.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 56094017d..030d125af 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -231,15 +231,13 @@ def test_read_storage_dependencies(cloud_test_catalog, cloud_type): ctc = cloud_test_catalog src_uri = ctc.src_uri uri = f"{src_uri}/cats" + dep_name, _, _ = parse_listing_uri(uri, ctc.catalog.client_config) ds_name = "dep" dc.read_storage(uri, session=ctc.session).save(ds_name) dependencies = ctc.session.catalog.get_dataset_dependencies(ds_name, 1) assert len(dependencies) == 1 assert dependencies[0].type == DatasetDependencyType.STORAGE - if cloud_type == "file": - assert dependencies[0].name == uri - else: - assert dependencies[0].name == src_uri + assert dependencies[0].name == dep_name @pytest.mark.parametrize("use_cache", [True, False]) From b7b16bad3e89ad0f2f40a4a69e63376bbc7edf07 Mon Sep 17 00:00:00 2001 From: ilongin Date: Fri, 25 Apr 2025 13:44:09 +0200 Subject: [PATCH 30/45] updating docs --- src/datachain/delta.py | 4 +- src/datachain/lib/dc/datachain.py | 45 ++++++++++++++-- src/datachain/lib/dc/datasets.py | 18 ++++++- src/datachain/lib/dc/storage.py | 20 ++++++- tests/func/test_delta.py | 88 +++++++++++++++++++++++++++---- 5 files changed, 159 insertions(+), 16 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index f375e4501..1fd116a35 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -40,7 +40,9 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # first creation of delta update dataset return None - dependencies = catalog.get_dataset_dependencies(name, latest_version) + dependencies = catalog.get_dataset_dependencies( + name, latest_version, 
indirect=False + ) if len(dependencies) > 1: raise Exception( "Cannot do delta with dataset that has multiple direct dependencies" diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 1ac897993..d0490dd33 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -4,6 +4,7 @@ import sys import warnings from collections.abc import Iterator, Sequence +from functools import wraps from typing import ( IO, TYPE_CHECKING, @@ -67,11 +68,34 @@ if TYPE_CHECKING: import pandas as pd - from typing_extensions import ParamSpec, Self + from typing_extensions import Concatenate, ParamSpec, Self P = ParamSpec("P") +T = TypeVar("T", bound="DataChain") + + +def delta_disabled( + method: "Callable[Concatenate[T, P], T]", +) -> "Callable[Concatenate[T, P], T]": + """ + Decorator for disabling DataChain methods (e.g `.agg()` or `.union()`) to + work with delta updates. It throws `NotImplementedError` if chain on which + method is called is marked as delta. + """ + + @wraps(method) + def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T: + if self.delta: + raise NotImplementedError( + f"Delta update cannot be used with {method.__name__}" + ) + return method(self, *args, **kwargs) + + return _inner + + class DataChain: """DataChain - a data structure for batch data processing and evaluation. @@ -164,6 +188,7 @@ def __init__( self.signals_schema = signal_schema self._setup: dict = setup or {} self._sys = _sys + self._delta = False def __repr__(self) -> str: """Return a string representation of the chain.""" @@ -177,6 +202,16 @@ def __repr__(self) -> str: self.print_schema(file=file) return file.getvalue() + def as_delta(self, delta: bool = False) -> "Self": + """Marks this chain as delta, which means special delta process will be + called on saving dataset for optimization""" + self._delta = delta + return self + + @property + def delta(self) -> bool: + return self._delta + @property def schema(self) -> dict[str, DataType]: """Get schema of the chain.""" @@ -461,7 +496,6 @@ def save( # type: ignore[override] version: Optional[int] = None, description: Optional[str] = None, attrs: Optional[list[str]] = None, - delta: Optional[bool] = False, **kwargs, ) -> "Self": """Save to a Dataset. It returns the chain itself. @@ -488,7 +522,7 @@ def save( # type: ignore[override] source while deleted records are not removed in the new dataset version. 
""" schema = self.signals_schema.clone_without_sys_signals().serialize() - if delta and name: + if self.delta and name: delta_ds = delta_update(self, name) if delta_ds: return self._evolve( @@ -620,6 +654,7 @@ def gen( signal_schema=udf_obj.output, ) + @delta_disabled def agg( self, func: Optional[Callable] = None, @@ -773,6 +808,7 @@ def order_by(self, *args, descending: bool = False) -> "Self": return self._evolve(query=self._query.order_by(*args)) + @delta_disabled def distinct(self, arg: str, *args: str) -> "Self": # type: ignore[override] """Removes duplicate rows based on uniqueness of some input column(s) i.e if rows are found with the same value of input column(s), only one @@ -807,6 +843,7 @@ def select_except(self, *args: str) -> "Self": query=self._query.select(*columns), signal_schema=new_schema ) + @delta_disabled # type: ignore[arg-type] def group_by( self, *, @@ -1165,6 +1202,7 @@ def remove_file_signals(self) -> "Self": schema = self.signals_schema.clone_without_file_signals() return self.select(*schema.values.keys()) + @delta_disabled def merge( self, right_ds: "DataChain", @@ -1273,6 +1311,7 @@ def _resolve( return ds + @delta_disabled def union(self, other: "Self") -> "Self": """Return the set union of the two datasets. diff --git a/src/datachain/lib/dc/datasets.py b/src/datachain/lib/dc/datasets.py index 1765a92d7..12b228b95 100644 --- a/src/datachain/lib/dc/datasets.py +++ b/src/datachain/lib/dc/datasets.py @@ -26,6 +26,7 @@ def read_dataset( session: Optional[Session] = None, settings: Optional[dict] = None, fallback_to_studio: bool = True, + delta: bool = False, ) -> "DataChain": """Get data from a saved Dataset. It returns the chain itself. If dataset or version is not found locally, it will try to pull it from Studio. @@ -37,6 +38,21 @@ def read_dataset( settings : Settings to use for the chain. fallback_to_studio : Try to pull dataset from Studio if not found locally. Default is True. + delta : If True, we optimize on creation of the new dataset versions + by calculating diff between last version of this dataset and the version + with which last version of resulting chain dataset (the one specified in + `.save()`) was created. + We then run the "diff" chain with this diff data returned instead of + all dataset data, and we union that diff chain with last version of + resulting dataset creating new version of it. + This way we avoid applying modifications to all records from dataset + every time since that can be expensive operation. + Dataset needs to have File object in schema. + Diff is calculated using `DataChain.diff()` method which looks into + File `source` and `path` for matching, and File `version` and `etag` + for checking if the record is changed. + Note that this takes in account only added and changed records in + dataset while deleted records are not removed in the new dataset version. 
Example: ```py @@ -92,7 +108,7 @@ def read_dataset( signals_schema |= SignalSchema.deserialize(query.feature_schema) else: signals_schema |= SignalSchema.from_column_types(query.column_types or {}) - return DataChain(query, _settings, signals_schema) + return DataChain(query, _settings, signals_schema).as_delta(delta) def datasets( diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 551ef160a..9814d1f89 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -32,6 +32,7 @@ def read_storage( column: str = "file", update: bool = False, anon: bool = False, + delta: bool = False, client_config: Optional[dict] = None, ) -> "DataChain": """Get data from storage(s) as a list of file with all file attributes. @@ -47,6 +48,21 @@ def read_storage( update : force storage reindexing. Default is False. anon : If True, we will treat cloud bucket as public one client_config : Optional client configuration for the storage client. + delta : If True, we optimize on creation of the new dataset versions + by calculating diff between last version of this storage and the version + with which last version of resulting chain dataset (the one specified in + `.save()`) was created. + We then run the "diff" chain with this diff data returned instead of + all storage data, and we union that diff chain with last version of + resulting dataset creating new version of it. + This way we avoid applying modifications to all records from storage + every time since that can be expensive operation. + Dataset needs to have File object in schema. + Diff is calculated using `DataChain.diff()` method which looks into + File `source` and `path` for matching, and File `version` and `etag` + for checking if the record is changed. + Note that this takes in account only added and changed records in + storage while deleted records are not removed in the new dataset version. Returns: DataChain: A DataChain object containing the file information. 
@@ -122,7 +138,7 @@ def read_storage( ) continue - dc = read_dataset(list_ds_name, session=session, settings=settings) + dc = read_dataset(list_ds_name, session=session, settings=settings, delta=delta) dc._query.update = update dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type}) @@ -151,7 +167,7 @@ def lst_fn(ds_name, lst_uri): chain = ls(dc, list_path, recursive=recursive, column=column) - storage_chain = storage_chain.union(chain) if storage_chain else chain + storage_chain = storage_chain.union(chain) if storage_chain else chain # type: ignore[attr-defined] listed_ds_name.add(list_ds_name) if file_values: diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 74d7f78d0..e14944694 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -31,10 +31,9 @@ def create_image_dataset(ds_name, images): ).save(ds_name) def create_delta_dataset(ds_name): - dc.read_dataset( - starting_ds_name, - session=test_session, - ).save(ds_name, delta=True) + dc.read_dataset(starting_ds_name, session=test_session, delta=True).save( + ds_name + ) # first version of starting dataset create_image_dataset(starting_ds_name, images[:2]) @@ -89,13 +88,13 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - dc.read_storage(path, update=True, session=test_session) + dc.read_storage(path, update=True, session=test_session, delta=True) .filter(C("file.path").glob("*.jpg")) .map(emb=my_embedding) .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) .map(index=get_index) .filter(C("index") > 3) - .save(ds_name, delta=True) + .save(ds_name) ) # first version of delta dataset @@ -173,11 +172,11 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - dc.read_storage(path, update=True, session=test_session) + dc.read_storage(path, update=True, session=test_session, delta=True) .filter(C("file.path").glob("*.jpg")) .map(index=get_index) .filter(C("index") > 5) - .save(ds_name, delta=True) + .save(ds_name) ) create_delta_dataset() @@ -212,9 +211,80 @@ def test_delta_update_no_file_signals(test_session): dc.read_dataset( starting_ds_name, session=test_session, - ).save("delta_ds", delta=True) + delta=True, + ).save("delta_ds") assert ( str(excinfo.value) == "Chain doesn't produce file signal, cannot do delta update" ) + + +@pytest.fixture +def file_dataset(test_session): + return dc.read_values( + file=[ + File(path="a.jpg", source="s3://bucket"), + File(path="b.jpg", source="s3://bucket"), + ], + session=test_session, + ).save("file_ds") + + +def test_delta_update_union(test_session, file_dataset): + dc.read_values(num=[10, 20], session=test_session).save("numbers") + + with pytest.raises(NotImplementedError) as excinfo: + ( + dc.read_dataset(file_dataset.name, session=test_session, delta=True).union( + dc.read_dataset("numbers"), session=test_session + ) + ) + + assert str(excinfo.value) == "Delta update cannot be used with union" + + +def test_delta_update_merge(test_session, file_dataset): + dc.read_values(num=[10, 20], session=test_session).save("numbers") + + with pytest.raises(NotImplementedError) as excinfo: + ( + dc.read_dataset(file_dataset.name, session=test_session, delta=True).merge( + dc.read_dataset("numbers"), on="id", session=test_session + ) + ) + + assert str(excinfo.value) == "Delta update cannot be used with merge" + + +def test_delta_update_distinct(test_session, file_dataset): + with pytest.raises(NotImplementedError) as excinfo: + ( + 
dc.read_dataset( + file_dataset.name, session=test_session, delta=True + ).distinct("file.path") + ) + + assert str(excinfo.value) == "Delta update cannot be used with distinct" + + +def test_delta_update_group_by(test_session, file_dataset): + with pytest.raises(NotImplementedError) as excinfo: + ( + dc.read_dataset( + file_dataset.name, session=test_session, delta=True + ).group_by(cnt=func.count(), partition_by="file.path") + ) + + assert str(excinfo.value) == "Delta update cannot be used with group_by" + + +def test_delta_update_agg(test_session, file_dataset): + with pytest.raises(NotImplementedError) as excinfo: + ( + dc.read_dataset(file_dataset.name, session=test_session, delta=True).agg( + cnt=func.count(), partition_by="file.path" + ) + ) + + assert str(excinfo.value) == "Delta update cannot be used with agg" From 723a1a65f9130b93bf13da441d64d738392cb5c7 Mon Sep 17 00:00:00 2001 From: ilongin Date: Fri, 25 Apr 2025 16:23:50 +0200 Subject: [PATCH 31/45] not creating dataset if diff is empty --- src/datachain/delta.py | 11 +++++++---- src/datachain/lib/dc/datachain.py | 21 +++++++++++++++++--- src/datachain/lib/dc/storage.py | 4 ++-- tests/func/test_delta.py | 32 ++++++++++++++----------------- tests/unit/lib/test_datachain.py | 4 ++++ 5 files changed, 45 insertions(+), 27 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 1fd116a35..a9244e014 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -17,7 +17,7 @@ def _append_steps(dc: "DataChain", other: "DataChain"): return dc -def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: +def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], bool]: """ Creates new chain that consists of the last version of current delta dataset plus diff from the source with all needed modifications. @@ -38,7 +38,7 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: latest_version = catalog.get_dataset(name).latest_version except DatasetNotFoundError: # first creation of delta update dataset - return None + return None, True dependencies = catalog.get_dataset_dependencies( name, latest_version, indirect=False @@ -52,7 +52,7 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: if not dep: # starting dataset (e.g listing) was removed so we are backing off to normal # dataset creation, as it was created first time - return None + return None, True source_ds_name = dep.name source_ds_version = int(dep.version) @@ -68,6 +68,9 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # We append all the steps from the original chain to diff, e.g filters, mappers. 
diff = _append_steps(diff, dc) + if diff.is_empty(): + return None, False + # merging diff and the latest version of dataset return ( datachain.read_dataset(name, latest_version) @@ -79,4 +82,4 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: modified=False, ) .union(diff) - ) + ), True diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index d0490dd33..003830773 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -291,7 +291,7 @@ def _evolve( _sys = self._sys return type(self)( query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys - ) + ).as_delta(self.delta) def settings( self, @@ -497,7 +497,7 @@ def save( # type: ignore[override] description: Optional[str] = None, attrs: Optional[list[str]] = None, **kwargs, - ) -> "Self": + ) -> "DataChain": """Save to a Dataset. It returns the chain itself. Parameters: @@ -523,13 +523,24 @@ def save( # type: ignore[override] """ schema = self.signals_schema.clone_without_sys_signals().serialize() if self.delta and name: - delta_ds = delta_update(self, name) + delta_ds, has_changes = delta_update(self, name) + if delta_ds: return self._evolve( query=delta_ds._query.save( name=name, version=version, feature_schema=schema, **kwargs ) ) + + if not has_changes: + # sources have not been changed so new version of resulting dataset + # would be the same as previous one. To avoid duplicating exact + # datasets, we won't create new version of it and we will return + # current latest version instead. + from .datasets import read_dataset + + return read_dataset(name, **kwargs) + return self._evolve( query=self._query.save( name=name, @@ -2208,6 +2219,10 @@ def count(self) -> int: """Return the number of rows in the chain.""" return self._query.count() + def is_empty(self) -> bool: + """Returns True if chain has zero number of rows""" + return not bool(self.count()) + def exec(self) -> "Self": """Execute the chain.""" return self._evolve(query=self._query.exec()) diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 9814d1f89..20599e450 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -138,7 +138,7 @@ def read_storage( ) continue - dc = read_dataset(list_ds_name, session=session, settings=settings, delta=delta) + dc = read_dataset(list_ds_name, session=session, settings=settings) dc._query.update = update dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type}) @@ -184,4 +184,4 @@ def lst_fn(ds_name, lst_uri): assert storage_chain is not None - return storage_chain + return storage_chain.as_delta(delta) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index e14944694..28da6e3a4 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -6,6 +6,7 @@ import datachain as dc from datachain import func +from datachain.error import DatasetVersionNotFoundError from datachain.lib.dc import C from datachain.lib.file import File, ImageFile @@ -182,24 +183,19 @@ def get_index(file: File) -> int: create_delta_dataset() create_delta_dataset() - assert ( - list( - dc.read_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") - ) - == list( - dc.read_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") - ) - == [ - "images/img6.jpg", - "images/img7.jpg", - "images/img8.jpg", - "images/img9.jpg", - ] - ) + assert list( + dc.read_dataset(ds_name, version=1).order_by("file.path").collect("file.path") + ) == [ + 
"images/img6.jpg", + "images/img7.jpg", + "images/img8.jpg", + "images/img9.jpg", + ] + + with pytest.raises(DatasetVersionNotFoundError) as exc_info: + dc.read_dataset(ds_name, version=2) + + assert str(exc_info.value) == f"Dataset {ds_name} does not have version 2" def test_delta_update_no_file_signals(test_session): diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index ddcb8d72c..2e1568aef 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -274,6 +274,10 @@ def test_read_record_empty_chain_without_schema(test_session): ) +def test_is_empty(test_session): + assert dc.read_records([], schema=None, session=test_session).is_empty() is True + + def test_empty_chain_skip_udf_run(test_session): # Test that UDF is not called for empty chain with patch.object(UDFAdapter, "run") as mock_udf_run: From c670e33e48ad032dd7aaa0b1ce23aa76d0f61633 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 12:24:47 +0200 Subject: [PATCH 32/45] adding diff persist to avoid re-calculation of diff and removing obsolete delta comments --- src/datachain/delta.py | 2 ++ src/datachain/lib/dc/datachain.py | 14 -------------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index a9244e014..70647b260 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -71,6 +71,8 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo if diff.is_empty(): return None, False + diff = diff.persist() + # merging diff and the latest version of dataset return ( datachain.read_dataset(name, latest_version) diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 003830773..27a7b415f 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -506,20 +506,6 @@ def save( # type: ignore[override] description : description of a dataset. attrs : attributes of a dataset. They can be without value, e.g "NLP", or with a value, e.g "location=US". - delta : If True, we optimize on creation of the new dataset versions - by calculating diff between source and the last version of dataset - and applying all needed modifications (mappers, filters etc.) only - on that diff. - Then we merge modified diff with the last version of dataset to - create new version. This way we avoid applying modifications to all - records from source every time since that can be expensive operation. - Source can be cloud storage or other dataset which has File object - in schema. - Diff is calculated using `DataChain.diff()` method which looks into - File `source` and `path` for matching, and File `version` and `etag` - for checking if the record is changed. - Note that this takes in account only added and changed records in - source while deleted records are not removed in the new dataset version. 
""" schema = self.signals_schema.clone_without_sys_signals().serialize() if self.delta and name: From 773b22d0de63851448cf58e593d45d7fec089bb2 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 12:29:38 +0200 Subject: [PATCH 33/45] adding count after persist to avoid re-calculating diff twice --- src/datachain/delta.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 70647b260..f5124edc3 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -68,11 +68,12 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo # We append all the steps from the original chain to diff, e.g filters, mappers. diff = _append_steps(diff, dc) + # to avoid re-calculating diff multiple times + diff = diff.persist() + if diff.is_empty(): return None, False - diff = diff.persist() - # merging diff and the latest version of dataset return ( datachain.read_dataset(name, latest_version) From e8de5f29a767a1af0c4b354a8c674e3311682e77 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 13:19:44 +0200 Subject: [PATCH 34/45] moving ad_delta to private and fixing delta docs --- src/datachain/lib/dc/datachain.py | 4 ++-- src/datachain/lib/dc/datasets.py | 32 +++++++++++++++--------------- src/datachain/lib/dc/storage.py | 33 ++++++++++++++++--------------- 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 27a7b415f..6f8a3dc40 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -202,7 +202,7 @@ def __repr__(self) -> str: self.print_schema(file=file) return file.getvalue() - def as_delta(self, delta: bool = False) -> "Self": + def _as_delta(self, delta: bool = False) -> "Self": """Marks this chain as delta, which means special delta process will be called on saving dataset for optimization""" self._delta = delta @@ -291,7 +291,7 @@ def _evolve( _sys = self._sys return type(self)( query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys - ).as_delta(self.delta) + )._as_delta(self.delta) def settings( self, diff --git a/src/datachain/lib/dc/datasets.py b/src/datachain/lib/dc/datasets.py index 12b228b95..a82e92ab8 100644 --- a/src/datachain/lib/dc/datasets.py +++ b/src/datachain/lib/dc/datasets.py @@ -38,21 +38,21 @@ def read_dataset( settings : Settings to use for the chain. fallback_to_studio : Try to pull dataset from Studio if not found locally. Default is True. - delta : If True, we optimize on creation of the new dataset versions - by calculating diff between last version of this dataset and the version - with which last version of resulting chain dataset (the one specified in - `.save()`) was created. - We then run the "diff" chain with this diff data returned instead of - all dataset data, and we union that diff chain with last version of - resulting dataset creating new version of it. - This way we avoid applying modifications to all records from dataset - every time since that can be expensive operation. - Dataset needs to have File object in schema. - Diff is calculated using `DataChain.diff()` method which looks into - File `source` and `path` for matching, and File `version` and `etag` - for checking if the record is changed. - Note that this takes in account only added and changed records in - dataset while deleted records are not removed in the new dataset version. 
+ delta: If True, we optimize the creation of new dataset versions by calculating + the diff between the latest version of this dataset and the version used + to create the most recent version of the resulting chain dataset (the one + specified in .save()). + We then run the "diff" chain using only the diff data, instead of the + entire dataset, and merge that diff chain with the latest version of the + resulting dataset to create a new version. + This approach avoids modifying all records in the dataset every time, + which can be an expensive operation. + The dataset schema must include a File object. + The diff is calculated using the DataChain.diff() method, which compares + the source and path fields of File objects to find matches, and checks + the version and etag fields to determine if a record has changed. + Note that this process only accounts for added and modified records in + the dataset. Deleted records are not removed in the new dataset version. Example: ```py @@ -108,7 +108,7 @@ def read_dataset( signals_schema |= SignalSchema.deserialize(query.feature_schema) else: signals_schema |= SignalSchema.from_column_types(query.column_types or {}) - return DataChain(query, _settings, signals_schema).as_delta(delta) + return DataChain(query, _settings, signals_schema)._as_delta(delta) def datasets( diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 20599e450..743d1d012 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -48,21 +48,22 @@ def read_storage( update : force storage reindexing. Default is False. anon : If True, we will treat cloud bucket as public one client_config : Optional client configuration for the storage client. - delta : If True, we optimize on creation of the new dataset versions - by calculating diff between last version of this storage and the version - with which last version of resulting chain dataset (the one specified in - `.save()`) was created. - We then run the "diff" chain with this diff data returned instead of - all storage data, and we union that diff chain with last version of - resulting dataset creating new version of it. - This way we avoid applying modifications to all records from storage - every time since that can be expensive operation. - Dataset needs to have File object in schema. - Diff is calculated using `DataChain.diff()` method which looks into - File `source` and `path` for matching, and File `version` and `etag` - for checking if the record is changed. - Note that this takes in account only added and changed records in - storage while deleted records are not removed in the new dataset version. + delta: If True, we optimize the creation of new dataset versions by calculating + the diff between the latest version of this storage and the version used to + create the most recent version of the resulting chain dataset (the one + specified in .save()). + We then run the "diff" chain using only the diff data, rather than the + entire storage data, and merge that diff chain with the latest version + of the resulting dataset to create a new version. + This approach avoids applying modifications to all records from storage + every time, which can be an expensive operation. + The dataset schema must include a File object. + The diff is calculated using the DataChain.diff() method, which compares + the source and path fields of File objects to find matches, and checks the + version and etag fields to determine if a record has changed. 
+ Note that this process only considers added and modified records in + storage. + Deleted records are not removed from the new dataset version. Returns: DataChain: A DataChain object containing the file information. @@ -184,4 +185,4 @@ def lst_fn(ds_name, lst_uri): assert storage_chain is not None - return storage_chain.as_delta(delta) + return storage_chain._as_delta(delta) From e8d6f2dfd147c259c0ba54ac7d88ca0aec542c60 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 13:47:17 +0200 Subject: [PATCH 35/45] removing not reachable codebase --- src/datachain/delta.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index f5124edc3..9b2b61802 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -43,10 +43,6 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo dependencies = catalog.get_dataset_dependencies( name, latest_version, indirect=False ) - if len(dependencies) > 1: - raise Exception( - "Cannot do delta with dataset that has multiple direct dependencies" - ) dep = dependencies[0] if not dep: From 803345ae88355295d371e162059cfe4029ca8e61 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 15:19:18 +0200 Subject: [PATCH 36/45] fixing lint issue --- src/datachain/lib/dc/storage.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 743d1d012..0722a0486 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -1,4 +1,5 @@ import os.path +from functools import reduce from typing import ( TYPE_CHECKING, Optional, @@ -122,7 +123,7 @@ def read_storage( if not uris: raise ValueError("No URIs provided") - storage_chain = None + chains = [] listed_ds_name = set() file_values = [] @@ -166,11 +167,11 @@ def lst_fn(ds_name, lst_uri): lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri) ) - chain = ls(dc, list_path, recursive=recursive, column=column) - - storage_chain = storage_chain.union(chain) if storage_chain else chain # type: ignore[attr-defined] + chains.append(ls(dc, list_path, recursive=recursive, column=column)) listed_ds_name.add(list_ds_name) + storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains) + if file_values: file_chain = read_values( session=session, From 08a4c1bea6fdcc5a383aaee51ad4e23bf17b78a5 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 15:46:14 +0200 Subject: [PATCH 37/45] added test to check num of processing calls --- tests/func/test_delta.py | 44 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 28da6e3a4..b6cbca081 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -153,6 +153,50 @@ def get_index(file: File) -> int: ) +def test_delta_update_check_num_calls(test_session, tmp_dir, tmp_path, capsys): + ds_name = "delta_ds" + path = tmp_dir.as_uri() + tmp_dir = tmp_dir / "images" + os.mkdir(tmp_dir) + map_print = "In map" + + images = [ + { + "name": f"img{i}.jpg", + "data": Image.new(mode="RGB", size=((i + 1) * 10, (i + 1) * 10)), + } + for i in range(20) + ] + + # save only half of the images for now + for img in images[:10]: + img["data"].save(tmp_dir / img["name"]) + + def create_delta_dataset(): + def get_index(file: File) -> int: + print(map_print) # needed to count number of map calls + r = r".+\/img(\d+)\.jpg" + return int(re.search(r, 
file.path).group(1)) # type: ignore[union-attr] + + ( + dc.read_storage(path, update=True, session=test_session, delta=True) + .map(index=get_index) + .save(ds_name) + ) + + # first version of delta dataset + create_delta_dataset() + # save other half of images + for img in images[10:]: + img["data"].save(tmp_dir / img["name"]) + # second version of delta dataset + create_delta_dataset() + + captured = capsys.readouterr() + # assert captured.out == "Garbage collecting 2 tables.\n" + assert captured.out == "\n".join([map_print] * 20) + "\n" + + def test_delta_update_no_diff(test_session, tmp_dir, tmp_path): ds_name = "delta_ds" path = tmp_dir.as_uri() From b0470ce3206782015de3528edd187c16a8e30c6f Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 16:08:55 +0200 Subject: [PATCH 38/45] adding schema to diff instead of appending --- src/datachain/delta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 9b2b61802..8a3855b29 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -13,7 +13,7 @@ def _append_steps(dc: "DataChain", other: "DataChain"): """ dc = dc.clone() dc._query.steps += other._query.steps.copy() - dc.signals_schema = dc.signals_schema.append(other.signals_schema) + dc.signals_schema = other.signals_schema return dc From 2ab17590385c1aad14249e991ae70e3a2fb2b43c Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 6 May 2025 11:39:03 +0200 Subject: [PATCH 39/45] moving delta_disabled to delta.py --- src/datachain/delta.py | 30 +++++++++++++++++++++++++++++- src/datachain/lib/dc/datachain.py | 25 ++----------------------- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 8a3855b29..fc0a7be90 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -1,11 +1,39 @@ -from typing import TYPE_CHECKING, Optional +from functools import wraps +from typing import TYPE_CHECKING, Callable, Optional, TypeVar import datachain from datachain.error import DatasetNotFoundError if TYPE_CHECKING: + from typing_extensions import Concatenate, ParamSpec + from datachain.lib.dc import DataChain + P = ParamSpec("P") + + +T = TypeVar("T", bound="DataChain") + + +def delta_disabled( + method: "Callable[Concatenate[T, P], T]", +) -> "Callable[Concatenate[T, P], T]": + """ + Decorator for disabling DataChain methods (e.g `.agg()` or `.union()`) to + work with delta updates. It throws `NotImplementedError` if chain on which + method is called is marked as delta. + """ + + @wraps(method) + def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T: + if self.delta: + raise NotImplementedError( + f"Delta update cannot be used with {method.__name__}" + ) + return method(self, *args, **kwargs) + + return _inner + def _append_steps(dc: "DataChain", other: "DataChain"): """Returns cloned chain with appended steps from other chain. 
diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 6f8a3dc40..d4b3440a2 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -4,7 +4,6 @@ import sys import warnings from collections.abc import Iterator, Sequence -from functools import wraps from typing import ( IO, TYPE_CHECKING, @@ -25,7 +24,7 @@ from tqdm import tqdm from datachain.dataset import DatasetRecord -from datachain.delta import delta_update +from datachain.delta import delta_disabled, delta_update from datachain.func import literal from datachain.func.base import Function from datachain.func.func import Func @@ -68,7 +67,7 @@ if TYPE_CHECKING: import pandas as pd - from typing_extensions import Concatenate, ParamSpec, Self + from typing_extensions import ParamSpec, Self P = ParamSpec("P") @@ -76,26 +75,6 @@ T = TypeVar("T", bound="DataChain") -def delta_disabled( - method: "Callable[Concatenate[T, P], T]", -) -> "Callable[Concatenate[T, P], T]": - """ - Decorator for disabling DataChain methods (e.g `.agg()` or `.union()`) to - work with delta updates. It throws `NotImplementedError` if chain on which - method is called is marked as delta. - """ - - @wraps(method) - def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T: - if self.delta: - raise NotImplementedError( - f"Delta update cannot be used with {method.__name__}" - ) - return method(self, *args, **kwargs) - - return _inner - - class DataChain: """DataChain - a data structure for batch data processing and evaluation. From 594ef7da2e245674e27a756bce81164fd13e2e4d Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 6 May 2025 12:26:08 +0200 Subject: [PATCH 40/45] moved is_empty to property empty --- src/datachain/delta.py | 2 +- src/datachain/lib/dc/datachain.py | 10 ++++++---- tests/unit/lib/test_datachain.py | 4 ++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index fc0a7be90..efead5740 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -95,7 +95,7 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo # to avoid re-calculating diff multiple times diff = diff.persist() - if diff.is_empty(): + if diff.empty: return None, False # merging diff and the latest version of dataset diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index d4b3440a2..c8ddf1a37 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -187,8 +187,14 @@ def _as_delta(self, delta: bool = False) -> "Self": self._delta = delta return self + @property + def empty(self) -> bool: + """Returns True if chain has zero number of rows""" + return not bool(self.count()) + @property def delta(self) -> bool: + """Returns True if this chain is ran in "delta" update mode""" return self._delta @property @@ -2184,10 +2190,6 @@ def count(self) -> int: """Return the number of rows in the chain.""" return self._query.count() - def is_empty(self) -> bool: - """Returns True if chain has zero number of rows""" - return not bool(self.count()) - def exec(self) -> "Self": """Execute the chain.""" return self._evolve(query=self._query.exec()) diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index b511e8533..4af272925 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -274,8 +274,8 @@ def test_read_record_empty_chain_without_schema(test_session): ) -def test_is_empty(test_session): - assert 
dc.read_records([], schema=None, session=test_session).is_empty() is True +def test_empty(test_session): + assert dc.read_records([], schema=None, session=test_session).empty is True def test_empty_chain_skip_udf_run(test_session): From 567d63f347e40610fd8abe4645013c39dfcdf408 Mon Sep 17 00:00:00 2001 From: ilongin Date: Thu, 8 May 2025 02:16:54 +0200 Subject: [PATCH 41/45] adding custom fields to calculate diff in delta update --- src/datachain/delta.py | 27 ++++----- src/datachain/diff/__init__.py | 6 +- src/datachain/lib/dc/datachain.py | 34 ++++++++++-- src/datachain/lib/dc/datasets.py | 55 ++++++++++++------ src/datachain/lib/dc/storage.py | 52 +++++++++++------ tests/func/test_delta.py | 92 +++++++++++++++++++------------ 6 files changed, 177 insertions(+), 89 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index efead5740..05c03505d 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -1,5 +1,6 @@ +from collections.abc import Sequence from functools import wraps -from typing import TYPE_CHECKING, Callable, Optional, TypeVar +from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union import datachain from datachain.error import DatasetNotFoundError @@ -45,7 +46,13 @@ def _append_steps(dc: "DataChain", other: "DataChain"): return dc -def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], bool]: +def delta_update( + dc: "DataChain", + name: str, + on: Union[str, Sequence[str]], + right_on: Optional[Union[str, Sequence[str]]] = None, + compare: Optional[Union[str, Sequence[str]]] = None, +) -> tuple[Optional["DataChain"], bool]: """ Creates new chain that consists of the last version of current delta dataset plus diff from the source with all needed modifications. @@ -58,10 +65,6 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo catalog = dc.session.catalog dc._query.apply_listing_pre_step() - chain_file_signal = dc.signals_schema.get_file_signal() - if not chain_file_signal: - raise ValueError("Chain doesn't produce file signal, cannot do delta update") - try: latest_version = catalog.get_dataset(name).latest_version except DatasetNotFoundError: @@ -84,11 +87,8 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo source_dc = datachain.read_dataset(source_ds_name, source_ds_version) source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version) - source_file_signal = source_dc.signals_schema.get_file_signal() - if not source_file_signal: - raise ValueError("Source dataset doesn't have file signals") - diff = source_dc_latest.diff(source_dc, on=source_file_signal) + diff = source_dc_latest.compare(source_dc, on=on, compare=compare) # We append all the steps from the original chain to diff, e.g filters, mappers. 
diff = _append_steps(diff, dc) @@ -101,10 +101,11 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo # merging diff and the latest version of dataset return ( datachain.read_dataset(name, latest_version) - .diff( + .compare( diff, - on=chain_file_signal, - right_on=source_file_signal, + on=on, + compare=compare, + right_on=right_on, added=True, modified=False, ) diff --git a/src/datachain/diff/__init__.py b/src/datachain/diff/__init__.py index 161c72f34..93451a66d 100644 --- a/src/datachain/diff/__init__.py +++ b/src/datachain/diff/__init__.py @@ -77,14 +77,16 @@ def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]: cols_select = list(left.signals_schema.clone_without_sys_signals().values.keys()) # getting correct on and right_on column names + on_ = on on = left.signals_schema.resolve(*on).db_signals() # type: ignore[assignment] - right_on = right.signals_schema.resolve(*(right_on or on)).db_signals() # type: ignore[assignment] + right_on = right.signals_schema.resolve(*(right_on or on_)).db_signals() # type: ignore[assignment] # getting correct compare and right_compare column names if they are defined if compare: + compare_ = compare compare = left.signals_schema.resolve(*compare).db_signals() # type: ignore[assignment] right_compare = right.signals_schema.resolve( - *(right_compare or compare) + *(right_compare or compare_) ).db_signals() # type: ignore[assignment] elif not compare and len(cols) != len(right_cols): # here we will mark all rows that are not added or deleted as modified since diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index c8ddf1a37..6f2a4c05a 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -181,10 +181,20 @@ def __repr__(self) -> str: self.print_schema(file=file) return file.getvalue() - def _as_delta(self, delta: bool = False) -> "Self": + def _as_delta( + self, + on: Optional[Union[str, Sequence[str]]] = None, + right_on: Optional[Union[str, Sequence[str]]] = None, + compare: Optional[Union[str, Sequence[str]]] = None, + ) -> "Self": """Marks this chain as delta, which means special delta process will be called on saving dataset for optimization""" - self._delta = delta + if on is None: + raise ValueError("'delta on' fields must be defined") + self._delta = True + self._delta_on = on + self._delta_right_on = right_on + self._delta_compare = compare return self @property @@ -274,9 +284,17 @@ def _evolve( signal_schema = copy.deepcopy(self.signals_schema) if _sys is None: _sys = self._sys - return type(self)( + chain = type(self)( query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys - )._as_delta(self.delta) + ) + if self.delta: + chain = chain._as_delta( + on=self._delta_on, + right_on=self._delta_right_on, + compare=self._delta_compare, + ) + + return chain def settings( self, @@ -494,7 +512,13 @@ def save( # type: ignore[override] """ schema = self.signals_schema.clone_without_sys_signals().serialize() if self.delta and name: - delta_ds, has_changes = delta_update(self, name) + delta_ds, has_changes = delta_update( + self, + name, + on=self._delta_on, + right_on=self._delta_right_on, + compare=self._delta_compare, + ) if delta_ds: return self._evolve( diff --git a/src/datachain/lib/dc/datasets.py b/src/datachain/lib/dc/datasets.py index a82e92ab8..073a0eacd 100644 --- a/src/datachain/lib/dc/datasets.py +++ b/src/datachain/lib/dc/datasets.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING, Optional, 
get_origin, get_type_hints
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
 
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
@@ -27,6 +28,10 @@ def read_dataset(
     settings: Optional[dict] = None,
     fallback_to_studio: bool = True,
     delta: bool = False,
+    delta_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_right_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_compare: Optional[Union[str, Sequence[str]]] = None,
+    # delta_right_compare: Optional[Union[str, Sequence[str]]] = None,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
     If dataset or version is not found locally, it will try to pull it from Studio.
@@ -38,21 +43,32 @@ def read_dataset(
         settings : Settings to use for the chain.
         fallback_to_studio : Try to pull dataset from Studio if not found locally.
            Default is True.
-        delta: If True, we optimize the creation of new dataset versions by calculating
-            the diff between the latest version of this dataset and the version used
-            to create the most recent version of the resulting chain dataset (the one
-            specified in .save()).
-            We then run the "diff" chain using only the diff data, instead of the
-            entire dataset, and merge that diff chain with the latest version of the
-            resulting dataset to create a new version.
-            This approach avoids modifying all records in the dataset every time,
-            which can be an expensive operation.
-            The dataset schema must include a File object.
-            The diff is calculated using the DataChain.diff() method, which compares
-            the source and path fields of File objects to find matches, and checks
-            the version and etag fields to determine if a record has changed.
-            Note that this process only accounts for added and modified records in
-            the dataset. Deleted records are not removed in the new dataset version.
+        delta: If set to True, we optimize the creation of new dataset versions by
+            calculating the diff between the latest version of this dataset and the
+            version used to create the most recent version of the resulting chain
+            dataset (the one specified in `.save()`). We then run the "diff" chain
+            using only the diff data, rather than the entire dataset, and merge
+            that diff chain with the latest version of the resulting dataset to create
+            a new version. This approach avoids applying modifications to all records
+            from the dataset every time, which can be an expensive operation.
+            The diff is calculated using the `DataChain.compare()` method, which
+            compares the `delta_on` fields to find matches and checks the compare
+            fields to determine if a record has changed. Note that this process only
+            considers added and modified records in the dataset; deleted records are
+            not removed from the new dataset version.
+            This calculation is based on the difference between the current version
+            of the source and the version used to create the dataset.
+        delta_on: A list of fields that uniquely identify rows in the source.
+            If two rows have the same values, they are considered the same (e.g., they
+            could be different versions of the same row in a versioned source).
+            This is used in the delta update to calculate the diff.
+        delta_right_on: A list of fields in the final dataset that correspond to the
+            `delta_on` fields if they were renamed.
+            There is no need to define this if the fields from `delta_on` are present
+            in the final dataset.
+ delta_compare: A list of fields used to check if the same row has been modified + in the new version of the source. + If not defined, all fields except those defined in delta_on will be used. Example: ```py @@ -108,7 +124,12 @@ def read_dataset( signals_schema |= SignalSchema.deserialize(query.feature_schema) else: signals_schema |= SignalSchema.from_column_types(query.column_types or {}) - return DataChain(query, _settings, signals_schema)._as_delta(delta) + chain = DataChain(query, _settings, signals_schema) + if delta: + chain = chain._as_delta( + on=delta_on, right_on=delta_right_on, compare=delta_compare + ) + return chain def datasets( diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 0722a0486..196de3140 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -1,4 +1,5 @@ import os.path +from collections.abc import Sequence from functools import reduce from typing import ( TYPE_CHECKING, @@ -34,6 +35,9 @@ def read_storage( update: bool = False, anon: bool = False, delta: bool = False, + delta_on: Optional[Union[str, Sequence[str]]] = None, + delta_right_on: Optional[Union[str, Sequence[str]]] = None, + delta_compare: Optional[Union[str, Sequence[str]]] = None, client_config: Optional[dict] = None, ) -> "DataChain": """Get data from storage(s) as a list of file with all file attributes. @@ -49,22 +53,32 @@ def read_storage( update : force storage reindexing. Default is False. anon : If True, we will treat cloud bucket as public one client_config : Optional client configuration for the storage client. - delta: If True, we optimize the creation of new dataset versions by calculating - the diff between the latest version of this storage and the version used to - create the most recent version of the resulting chain dataset (the one - specified in .save()). - We then run the "diff" chain using only the diff data, rather than the - entire storage data, and merge that diff chain with the latest version - of the resulting dataset to create a new version. - This approach avoids applying modifications to all records from storage - every time, which can be an expensive operation. - The dataset schema must include a File object. - The diff is calculated using the DataChain.diff() method, which compares - the source and path fields of File objects to find matches, and checks the - version and etag fields to determine if a record has changed. - Note that this process only considers added and modified records in - storage. - Deleted records are not removed from the new dataset version. + delta: If set to True, we optimize the creation of new dataset versions by + calculating the diff between the latest version of this storage and the + version used to create the most recent version of the resulting chain + dataset (the one specified in `.save()`). We then run the "diff" chain + using only the diff data, rather than the entire storage data, and merge + that diff chain with the latest version of the resulting dataset to create + a new version. This approach avoids applying modifications to all records + from storage every time, which can be an expensive operation. + The diff is calculated using the `DataChain.compare()` method, which + compares the `delta_on` fields to find matches and checks the compare + fields to determine if a record has changed. Note that this process only + considers added and modified records in storage; deleted records are not + removed from the new dataset version. 
+ This calculation is based on the difference between the current version + of the source and the version used to create the dataset. + delta_on: A list of fields that uniquely identify rows in the source. + If two rows have the same values, they are considered the same (e.g., they + could be different versions of the same row in a versioned source). + This is used in the delta update to calculate the diff. + delta_right_on: A list of fields in the final dataset that correspond to the + `delta_on` fields if they were renamed. + There is no need to define this if the fields from `delta_on` are present + in the final dataset. + delta_compare: A list of fields used to check if the same row has been modified + in the new version of the source. + If not defined, all fields except those defined in delta_on will be used. Returns: DataChain: A DataChain object containing the file information. @@ -186,4 +200,8 @@ def lst_fn(ds_name, lst_uri): assert storage_chain is not None - return storage_chain._as_delta(delta) + if delta: + storage_chain = storage_chain._as_delta( + on=delta_on, right_on=delta_right_on, compare=delta_compare + ) + return storage_chain diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index b6cbca081..9af749c10 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -32,9 +32,13 @@ def create_image_dataset(ds_name, images): ).save(ds_name) def create_delta_dataset(ds_name): - dc.read_dataset(starting_ds_name, session=test_session, delta=True).save( - ds_name - ) + dc.read_dataset( + starting_ds_name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + delta_compare=["file.version", "file.etag"], + ).save(ds_name) # first version of starting dataset create_image_dataset(starting_ds_name, images[:2]) @@ -89,7 +93,14 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - dc.read_storage(path, update=True, session=test_session, delta=True) + dc.read_storage( + path, + update=True, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + delta_compare=["file.version", "file.etag"], + ) .filter(C("file.path").glob("*.jpg")) .map(emb=my_embedding) .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) @@ -179,7 +190,14 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - dc.read_storage(path, update=True, session=test_session, delta=True) + dc.read_storage( + path, + update=True, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + delta_compare=["file.version", "file.etag"], + ) .map(index=get_index) .save(ds_name) ) @@ -217,7 +235,14 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - dc.read_storage(path, update=True, session=test_session, delta=True) + dc.read_storage( + path, + update=True, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + delta_compare=["file.version", "file.etag"], + ) .filter(C("file.path").glob("*.jpg")) .map(index=get_index) .filter(C("index") > 5) @@ -242,24 +267,6 @@ def get_index(file: File) -> int: assert str(exc_info.value) == f"Dataset {ds_name} does not have version 2" -def test_delta_update_no_file_signals(test_session): - starting_ds_name = "starting_ds" - - dc.read_values(num=[10, 20], session=test_session).save(starting_ds_name) - - with pytest.raises(ValueError) as excinfo: - dc.read_dataset( - starting_ds_name, - 
session=test_session, - delta=True, - ).save("delta_ds") - - assert ( - str(excinfo.value) - == "Chain doesn't produce file signal, cannot do delta update" - ) - - @pytest.fixture def file_dataset(test_session): return dc.read_values( @@ -276,9 +283,12 @@ def test_delta_update_union(test_session, file_dataset): with pytest.raises(NotImplementedError) as excinfo: ( - dc.read_dataset(file_dataset.name, session=test_session, delta=True).union( - dc.read_dataset("numbers"), session=test_session - ) + dc.read_dataset( + file_dataset.name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + ).union(dc.read_dataset("numbers"), session=test_session) ) assert str(excinfo.value) == "Delta update cannot be used with union" @@ -289,9 +299,12 @@ def test_delta_update_merge(test_session, file_dataset): with pytest.raises(NotImplementedError) as excinfo: ( - dc.read_dataset(file_dataset.name, session=test_session, delta=True).merge( - dc.read_dataset("numbers"), on="id", session=test_session - ) + dc.read_dataset( + file_dataset.name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + ).merge(dc.read_dataset("numbers"), on="id", session=test_session) ) assert str(excinfo.value) == "Delta update cannot be used with merge" @@ -301,7 +314,10 @@ def test_delta_update_distinct(test_session, file_dataset): with pytest.raises(NotImplementedError) as excinfo: ( dc.read_dataset( - file_dataset.name, session=test_session, delta=True + file_dataset.name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], ).distinct("file.path") ) @@ -312,7 +328,10 @@ def test_delta_update_group_by(test_session, file_dataset): with pytest.raises(NotImplementedError) as excinfo: ( dc.read_dataset( - file_dataset.name, session=test_session, delta=True + file_dataset.name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], ).group_by(cnt=func.count(), partition_by="file.path") ) @@ -322,9 +341,12 @@ def test_delta_update_group_by(test_session, file_dataset): def test_delta_update_agg(test_session, file_dataset): with pytest.raises(NotImplementedError) as excinfo: ( - dc.read_dataset(file_dataset.name, session=test_session, delta=True).agg( - cnt=func.count(), partition_by="file.path" - ) + dc.read_dataset( + file_dataset.name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + ).agg(cnt=func.count(), partition_by="file.path") ) assert str(excinfo.value) == "Delta update cannot be used with agg" From e1f60c79c41894774410a72af47e4caca02db76c Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 12 May 2025 10:36:42 +0200 Subject: [PATCH 42/45] fixing semver --- src/datachain/delta.py | 2 +- tests/func/test_delta.py | 28 +++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 05c03505d..3772af783 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -82,7 +82,7 @@ def delta_update( return None, True source_ds_name = dep.name - source_ds_version = int(dep.version) + source_ds_version = dep.version source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version source_dc = datachain.read_dataset(source_ds_name, source_ds_version) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 9af749c10..81199da7c 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -50,14 +50,18 @@ def create_delta_dataset(ds_name): create_delta_dataset(ds_name) assert list( - 
dc.read_dataset(ds_name, version=1).order_by("file.path").collect("file.path") + dc.read_dataset(ds_name, version="1.0.0") + .order_by("file.path") + .collect("file.path") ) == [ "img1.jpg", "img2.jpg", ] assert list( - dc.read_dataset(ds_name, version=2).order_by("file.path").collect("file.path") + dc.read_dataset(ds_name, version="1.0.1") + .order_by("file.path") + .collect("file.path") ) == [ "img1.jpg", "img2.jpg", @@ -116,7 +120,7 @@ def get_index(file: File) -> int: # into consideration on delta update etags = { r[0]: r[1].etag - for r in dc.read_dataset(ds_name, version=1).collect("index", "file") + for r in dc.read_dataset(ds_name, version="1.0.0").collect("index", "file") } # remove last couple of images to simulate modification since we will re-create it @@ -131,7 +135,9 @@ def get_index(file: File) -> int: create_delta_dataset() assert list( - dc.read_dataset(ds_name, version=1).order_by("file.path").collect("file.path") + dc.read_dataset(ds_name, version="1.0.0") + .order_by("file.path") + .collect("file.path") ) == [ "images/img4.jpg", "images/img6.jpg", @@ -139,7 +145,9 @@ def get_index(file: File) -> int: ] assert list( - dc.read_dataset(ds_name, version=2).order_by("file.path").collect("file.path") + dc.read_dataset(ds_name, version="1.0.1") + .order_by("file.path") + .collect("file.path") ) == [ "images/img10.jpg", "images/img12.jpg", @@ -155,7 +163,7 @@ def get_index(file: File) -> int: # and modified rows etags should be bigger than the old ones assert ( next( - dc.read_dataset(ds_name, version=2) + dc.read_dataset(ds_name, version="1.0.1") .filter(C("index") == 6) .order_by("file.path", "file.etag") .collect("file.etag") @@ -253,7 +261,9 @@ def get_index(file: File) -> int: create_delta_dataset() assert list( - dc.read_dataset(ds_name, version=1).order_by("file.path").collect("file.path") + dc.read_dataset(ds_name, version="1.0.0") + .order_by("file.path") + .collect("file.path") ) == [ "images/img6.jpg", "images/img7.jpg", @@ -262,9 +272,9 @@ def get_index(file: File) -> int: ] with pytest.raises(DatasetVersionNotFoundError) as exc_info: - dc.read_dataset(ds_name, version=2) + dc.read_dataset(ds_name, version="1.0.1") - assert str(exc_info.value) == f"Dataset {ds_name} does not have version 2" + assert str(exc_info.value) == f"Dataset {ds_name} does not have version 1.0.1" @pytest.fixture From be704a202214900d1f9a55c91777850c15e5344c Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 12 May 2025 11:09:24 +0200 Subject: [PATCH 43/45] renamed field --- src/datachain/lib/dc/datachain.py | 6 +++--- src/datachain/lib/dc/datasets.py | 19 +++++++++++-------- src/datachain/lib/dc/storage.py | 18 +++++++++++------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 5725f3daf..ac5b25149 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -194,7 +194,7 @@ def _as_delta( raise ValueError("'delta on' fields must be defined") self._delta = True self._delta_on = on - self._delta_right_on = right_on + self._delta_result_on = right_on self._delta_compare = compare return self @@ -291,7 +291,7 @@ def _evolve( if self.delta: chain = chain._as_delta( on=self._delta_on, - right_on=self._delta_right_on, + right_on=self._delta_result_on, compare=self._delta_compare, ) @@ -521,7 +521,7 @@ def save( # type: ignore[override] self, name, on=self._delta_on, - right_on=self._delta_right_on, + right_on=self._delta_result_on, compare=self._delta_compare, ) diff --git 
a/src/datachain/lib/dc/datasets.py b/src/datachain/lib/dc/datasets.py index 05d61ce22..d4a82e513 100644 --- a/src/datachain/lib/dc/datasets.py +++ b/src/datachain/lib/dc/datasets.py @@ -28,11 +28,10 @@ def read_dataset( session: Optional[Session] = None, settings: Optional[dict] = None, fallback_to_studio: bool = True, - delta: bool = False, + delta: Optional[bool] = False, delta_on: Optional[Union[str, Sequence[str]]] = None, - delta_right_on: Optional[Union[str, Sequence[str]]] = None, + delta_result_on: Optional[Union[str, Sequence[str]]] = None, delta_compare: Optional[Union[str, Sequence[str]]] = None, - # delta_right_compare: Optional[Union[str, Sequence[str]]] = None, ) -> "DataChain": """Get data from a saved Dataset. It returns the chain itself. If dataset or version is not found locally, it will try to pull it from Studio. @@ -63,10 +62,14 @@ def read_dataset( If two rows have the same values, they are considered the same (e.g., they could be different versions of the same row in a versioned source). This is used in the delta update to calculate the diff. - delta_right_on: A list of fields in the final dataset that correspond to the - `delta_on` fields if they were renamed. - There is no need to define this if the fields from `delta_on` are present - in the final dataset. + delta_result_on: A list of fields in the resulting dataset that correspond + to the `delta_on` fields from the source. + This is needed to identify rows that have changed in the source but are + already present in the current version of the resulting dataset, in order + to avoid including outdated versions of those rows in the new dataset. + We retain only the latest versions of rows to prevent duplication. + There is no need to define this if the `delta_on` fields are present in + the final dataset and have not been renamed. delta_compare: A list of fields used to check if the same row has been modified in the new version of the source. If not defined, all fields except those defined in delta_on will be used. @@ -148,7 +151,7 @@ def read_dataset( chain = DataChain(query, _settings, signals_schema) if delta: chain = chain._as_delta( - on=delta_on, right_on=delta_right_on, compare=delta_compare + on=delta_on, right_on=delta_result_on, compare=delta_compare ) return chain diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index edf8c0427..91d37ce66 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -35,9 +35,9 @@ def read_storage( column: str = "file", update: bool = False, anon: bool = False, - delta: bool = False, + delta: Optional[bool] = False, delta_on: Optional[Union[str, Sequence[str]]] = None, - delta_right_on: Optional[Union[str, Sequence[str]]] = None, + delta_result_on: Optional[Union[str, Sequence[str]]] = None, delta_compare: Optional[Union[str, Sequence[str]]] = None, client_config: Optional[dict] = None, ) -> "DataChain": @@ -73,10 +73,14 @@ def read_storage( If two rows have the same values, they are considered the same (e.g., they could be different versions of the same row in a versioned source). This is used in the delta update to calculate the diff. - delta_right_on: A list of fields in the final dataset that correspond to the - `delta_on` fields if they were renamed. - There is no need to define this if the fields from `delta_on` are present - in the final dataset. + delta_result_on: A list of fields in the resulting dataset that correspond + to the `delta_on` fields from the source. 
+ This is needed to identify rows that have changed in the source but are + already present in the current version of the resulting dataset, in order + to avoid including outdated versions of those rows in the new dataset. + We retain only the latest versions of rows to prevent duplication. + There is no need to define this if the `delta_on` fields are present in + the final dataset and have not been renamed. delta_compare: A list of fields used to check if the same row has been modified in the new version of the source. If not defined, all fields except those defined in delta_on will be used. @@ -210,6 +214,6 @@ def lst_fn(ds_name, lst_uri): if delta: storage_chain = storage_chain._as_delta( - on=delta_on, right_on=delta_right_on, compare=delta_compare + on=delta_on, right_on=delta_result_on, compare=delta_compare ) return storage_chain From 5decfeb576b142e31c24008542e6a44e9f27aed0 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 13 May 2025 17:29:41 +0200 Subject: [PATCH 44/45] fixing dataset dependencies in delta update --- src/datachain/delta.py | 23 ++++++++++++++--------- src/datachain/lib/dc/datachain.py | 8 ++++++-- src/datachain/query/dataset.py | 6 +++++- tests/func/test_delta.py | 17 +++++++++++++++++ 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 3772af783..4293e03c4 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -1,8 +1,10 @@ from collections.abc import Sequence +from copy import copy from functools import wraps from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union import datachain +from datachain.dataset import DatasetDependency from datachain.error import DatasetNotFoundError if TYPE_CHECKING: @@ -52,7 +54,7 @@ def delta_update( on: Union[str, Sequence[str]], right_on: Optional[Union[str, Sequence[str]]] = None, compare: Optional[Union[str, Sequence[str]]] = None, -) -> tuple[Optional["DataChain"], bool]: +) -> tuple[Optional["DataChain"], Optional[list[DatasetDependency]], bool]: """ Creates new chain that consists of the last version of current delta dataset plus diff from the source with all needed modifications. 
@@ -69,7 +71,7 @@ def delta_update( latest_version = catalog.get_dataset(name).latest_version except DatasetNotFoundError: # first creation of delta update dataset - return None, True + return None, None, True dependencies = catalog.get_dataset_dependencies( name, latest_version, indirect=False @@ -79,11 +81,14 @@ def delta_update( if not dep: # starting dataset (e.g listing) was removed so we are backing off to normal # dataset creation, as it was created first time - return None, True + return None, None, True source_ds_name = dep.name source_ds_version = dep.version source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version + dependencies = copy(dependencies) + dependencies = [d for d in dependencies if d is not None] # filter out removed dep + dependencies[0].version = source_ds_latest_version # type: ignore[union-attr] source_dc = datachain.read_dataset(source_ds_name, source_ds_version) source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version) @@ -96,18 +101,18 @@ def delta_update( diff = diff.persist() if diff.empty: - return None, False + return None, None, False # merging diff and the latest version of dataset - return ( + delta_chain = ( datachain.read_dataset(name, latest_version) .compare( diff, - on=on, - compare=compare, - right_on=right_on, + on=right_on or on, added=True, modified=False, ) .union(diff) - ), True + ) + + return delta_chain, dependencies, True # type: ignore[return-value] diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index ac5b25149..668f19155 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -517,7 +517,7 @@ def save( # type: ignore[override] schema = self.signals_schema.clone_without_sys_signals().serialize() if self.delta and name: - delta_ds, has_changes = delta_update( + delta_ds, dependencies, has_changes = delta_update( self, name, on=self._delta_on, @@ -528,7 +528,11 @@ def save( # type: ignore[override] if delta_ds: return self._evolve( query=delta_ds._query.save( - name=name, version=version, feature_schema=schema, **kwargs + name=name, + version=version, + feature_schema=schema, + dependencies=dependencies, + **kwargs, ) ) diff --git a/src/datachain/query/dataset.py b/src/datachain/query/dataset.py index 3bddee636..10194b3c4 100644 --- a/src/datachain/query/dataset.py +++ b/src/datachain/query/dataset.py @@ -41,7 +41,7 @@ partition_col_names, partition_columns, ) -from datachain.dataset import DATASET_PREFIX, DatasetStatus, RowDict +from datachain.dataset import DATASET_PREFIX, DatasetDependency, DatasetStatus, RowDict from datachain.error import DatasetNotFoundError, QueryScriptCancelError from datachain.func.base import Function from datachain.lib.listing import is_listing_dataset, listing_dataset_expired @@ -1698,6 +1698,7 @@ def save( name: Optional[str] = None, version: Optional[str] = None, feature_schema: Optional[dict] = None, + dependencies: Optional[list[DatasetDependency]] = None, description: Optional[str] = None, attrs: Optional[list[str]] = None, **kwargs, @@ -1751,6 +1752,9 @@ def save( ) self.catalog.update_dataset_version_with_warehouse_info(dataset, version) + if dependencies: + # overriding dependencies + self.dependencies = {(dep.name, dep.version) for dep in dependencies} self._add_dependencies(dataset, version) # type: ignore [arg-type] finally: self.cleanup() diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 81199da7c..80c013e63 100644 --- a/tests/func/test_delta.py +++ 
b/tests/func/test_delta.py @@ -11,7 +11,17 @@ from datachain.lib.file import File, ImageFile +def _get_dependencies(catalog, name, version) -> list[tuple[str, str]]: + return sorted( + [ + (d.name, d.version) + for d in catalog.get_dataset_dependencies(name, version, indirect=False) + ] + ) + + def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path): + catalog = test_session.catalog starting_ds_name = "starting_ds" ds_name = "delta_ds" @@ -37,6 +47,7 @@ def create_delta_dataset(ds_name): session=test_session, delta=True, delta_on=["file.source", "file.path"], + delta_result_on=["file.source", "file.path"], delta_compare=["file.version", "file.etag"], ).save(ds_name) @@ -44,10 +55,12 @@ def create_delta_dataset(ds_name): create_image_dataset(starting_ds_name, images[:2]) # first version of delta dataset create_delta_dataset(ds_name) + assert _get_dependencies(catalog, ds_name, "1.0.0") == [(starting_ds_name, "1.0.0")] # second version of starting dataset create_image_dataset(starting_ds_name, images[2:]) # second version of delta dataset create_delta_dataset(ds_name) + assert _get_dependencies(catalog, ds_name, "1.0.1") == [(starting_ds_name, "1.0.1")] assert list( dc.read_dataset(ds_name, version="1.0.0") @@ -69,6 +82,8 @@ def create_delta_dataset(ds_name): "img4.jpg", ] + create_delta_dataset(ds_name) + def test_delta_update_from_storage(test_session, tmp_dir, tmp_path): ds_name = "delta_ds" @@ -103,6 +118,7 @@ def get_index(file: File) -> int: session=test_session, delta=True, delta_on=["file.source", "file.path"], + delta_result_on=["file.source", "file.path"], delta_compare=["file.version", "file.etag"], ) .filter(C("file.path").glob("*.jpg")) @@ -204,6 +220,7 @@ def get_index(file: File) -> int: session=test_session, delta=True, delta_on=["file.source", "file.path"], + delta_result_on=["file.source", "file.path"], delta_compare=["file.version", "file.etag"], ) .map(index=get_index) From ab9f9a30e1abd8a95edd42002f77e2940db102d3 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 14 May 2025 16:44:47 +0200 Subject: [PATCH 45/45] fixing small issues with deleted --- src/datachain/delta.py | 3 ++- src/datachain/lib/dc/storage.py | 2 +- tests/func/test_delta.py | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 4293e03c4..22465c25c 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -93,7 +93,7 @@ def delta_update( source_dc = datachain.read_dataset(source_ds_name, source_ds_version) source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version) - diff = source_dc_latest.compare(source_dc, on=on, compare=compare) + diff = source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False) # We append all the steps from the original chain to diff, e.g filters, mappers. diff = _append_steps(diff, dc) @@ -111,6 +111,7 @@ def delta_update( on=right_on or on, added=True, modified=False, + deleted=False, ) .union(diff) ) diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 91d37ce66..827180e68 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -83,7 +83,7 @@ def read_storage( the final dataset and have not been renamed. delta_compare: A list of fields used to check if the same row has been modified in the new version of the source. - If not defined, all fields except those defined in delta_on will be used. + If not defined, all fields except those defined in `delta_on` will be used. 
Returns: DataChain: A DataChain object containing the file information. diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 80c013e63..9d6525525 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -147,6 +147,10 @@ def get_index(file: File) -> int: for img in images[5:]: img["data"].save(tmp_dir / img["name"]) + # remove first 5 images to check that deleted rows are not taken into consideration + for img in images[0:5]: + os.remove(tmp_dir / img["name"]) + # second version of delta dataset create_delta_dataset()
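
The resulting delta update API, as exercised in tests/func/test_delta.py above, can be sketched as follows. This is a minimal usage sketch, not a definitive recipe: the storage URI, dataset name and embedding UDF are placeholders, while the read_storage()/save() calls and the delta_* parameters mirror the ones introduced in this series.

```py
import datachain as dc
from datachain.lib.dc import C
from datachain.lib.file import File


def my_embedding(file: File) -> list[float]:
    # placeholder UDF; a real one would compute an embedding from the file
    return [0.5, 0.5]


(
    dc.read_storage(
        "s3://bkt/dir1/",  # placeholder storage URI
        delta=True,  # optimize new versions by processing only the diff
        delta_on=["file.source", "file.path"],  # identity of a row in the source
        delta_compare=["file.version", "file.etag"],  # detects modified rows
    )
    .filter(C("file.path").glob("*.jpg"))
    .map(emb=my_embedding)
    .save("delta_ds")  # later runs re-process only added/modified files
)
```

On the first run this behaves like a regular save; on subsequent runs the diff of the source is computed with `DataChain.compare()`, the filters and mappers are applied only to that diff, and the result is unioned with the latest version of "delta_ds" to produce the new version.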