From fa09d0ba0707c96a5fda85659115095fd4339910 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 19 Feb 2025 14:21:15 +0100 Subject: [PATCH 01/45] adding incremental update --- src/datachain/lib/dc.py | 57 +++++++++++++++++++++++++++++++++++- tests/func/test_datachain.py | 41 ++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 2b3429baf..0af60169f 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -411,6 +411,7 @@ def from_storage( object_name: str = "file", update: bool = False, anon: bool = False, + incremental: bool = False ) -> "Self": """Get data from a storage as a list of file with all file attributes. It returns the chain itself as usual. @@ -735,7 +736,11 @@ def listings( ) def save( # type: ignore[override] - self, name: Optional[str] = None, version: Optional[int] = None, **kwargs + self, + name: Optional[str] = None, + version: Optional[int] = None, + incremental: Optional[bool] = False, + **kwargs, ) -> "Self": """Save to a Dataset. It returns the chain itself. @@ -743,8 +748,58 @@ def save( # type: ignore[override] name : dataset name. Empty name saves to a temporary dataset that will be removed after process ends. Temp dataset are useful for optimization. version : version of a dataset. Default - the last version that exist. + incremental : wheather this is an incremental dataset or not. """ schema = self.signals_schema.clone_without_sys_signals().serialize() + if incremental and name: + """ + DataChain + .from_storage("s3://bkt/dir1/") + .filter(C("file.path").glob("*.jpg")) + .map(emb=my_embedding) + .save("incremental_ds") + + -> + DataChain + .from_storage("s3://bkt/dir1/") + .diff( + DataChain.from_dataset("incremental_ds", version=3), + on="file", # this should be get from ds feature schema + added=True, + modified=True, + ) + .filter(C("file.path").glob("*.jpg")) + .map(emb=my_embedding) + .save("incremental_ds") + + """ + from datachain.error import DatasetNotFoundError + try: + incremental_ds = self.session.catalog.get_dataset(name) + latest_version = incremental_ds.latest_version + diff = ( + DataChain.from_dataset( + self._query.starting_step.dataset_name, + version=self._query.starting_step.dataset_version + ) + .diff( + DataChain.from_dataset(name, version=latest_version), + on="file", # this should be get from ds feature schema + added=True, + modified=True, + ) + ) + diff._query.steps += self._query.steps + diff = diff.union(DataChain.from_dataset(name, latest_version)) + return self._evolve( + query=diff._query.save( + name=name, version=version, feature_schema=schema, **kwargs + ) + ) + except DatasetNotFoundError: + # dataset still doesn't exists so we continue with normal cration + pass + return self._evolve( query=self._query.save( name=name, version=version, feature_schema=schema, **kwargs diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index a03f542d5..e7f6bd620 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1788,3 +1788,44 @@ def func(key: str) -> str: for _ in range(4): with pytest.raises(Exception, match="Test Error!"): dc.map(res=func).exec() + + +def test_incremental_update(test_session, tmp_dir, tmp_path): + ds_name = "incremental_ds" + images = [ + {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + ] + + for img in images: + img["data"].save(tmp_path / img["name"]) + + 
DataChain.from_values( + file=[ + ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images + ], + session=test_session, + ).save(ds_name, incremental=True) + + new_images = [ + {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + ] + for img in new_images: + img["data"].save(tmp_path / img["name"]) + + images += new_images + + DataChain.from_values( + file=[ + ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images + ], + session=test_session, + ).save(ds_name, incremental=True) + + for im in dc.collect("file"): + print(im.path) + + assert 1 == 2 + + From 99a532746297de94bc9d114f3441f7efd927deec Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 19 Feb 2025 15:12:57 +0100 Subject: [PATCH 02/45] continued working on incremental --- src/datachain/lib/dc.py | 4 +++- tests/func/test_datachain.py | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 0af60169f..8f3d9dc9e 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -777,6 +777,8 @@ def save( # type: ignore[override] try: incremental_ds = self.session.catalog.get_dataset(name) latest_version = incremental_ds.latest_version + print(f"Starting ds is {self._query.starting_step.dataset_name}") + print(f"Starting ds version is {self._query.starting_step.dataset_version}") diff = ( DataChain.from_dataset( self._query.starting_step.dataset_name, @@ -797,7 +799,7 @@ def save( # type: ignore[override] ) ) except DatasetNotFoundError: - # dataset still doesn't exists so we continue with normal cration + # dataset doesn't exist yet so we can continue with normal cration pass return self._evolve( diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index e7f6bd620..809050ed1 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1791,6 +1791,7 @@ def func(key: str) -> str: def test_incremental_update(test_session, tmp_dir, tmp_path): + starting_ds_name = "starting_ds" ds_name = "incremental_ds" images = [ {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, @@ -1805,6 +1806,10 @@ def test_incremental_update(test_session, tmp_dir, tmp_path): ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images ], session=test_session, + ).save(starting_ds_name) + + DataChain.from_dataset( + starting_ds_name, session=test_session, ).save(ds_name, incremental=True) new_images = [ @@ -1815,15 +1820,25 @@ def test_incremental_update(test_session, tmp_dir, tmp_path): img["data"].save(tmp_path / img["name"]) images += new_images - DataChain.from_values( file=[ ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images ], session=test_session, + ).save(starting_ds_name) + + DataChain.from_dataset( + starting_ds_name, session=test_session, ).save(ds_name, incremental=True) - for im in dc.collect("file"): + dc = DataChain.from_dataset(ds_name) + + print("Images in version 1 are") + for im in DataChain.from_dataset(ds_name, version=1).collect("file"): + print(im.path) + + print("Images in version 2 are") + for im in DataChain.from_dataset(ds_name, version=2).collect("file"): print(im.path) assert 1 == 2 From f01b3a2b11c0c1d3216a09c3c163d1c97c6554f9 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 19 Feb 2025 16:11:57 +0100 Subject: [PATCH 03/45] finixhed first test --- tests/func/test_datachain.py | 77 +++++++++++++++++++----------------- 1 file changed, 
40 insertions(+), 37 deletions(-) diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 809050ed1..1cd0d2628 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1790,57 +1790,60 @@ def func(key: str) -> str: dc.map(res=func).exec() -def test_incremental_update(test_session, tmp_dir, tmp_path): +def test_incremental_update_from_dataset(test_session, tmp_dir, tmp_path): starting_ds_name = "starting_ds" ds_name = "incremental_ds" + images = [ {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, ] - for img in images: - img["data"].save(tmp_path / img["name"]) + def create_image_dataset(ds_name, images): + DataChain.from_values( + file=[ + ImageFile(path=img["name"], source=f"file://{tmp_path}") + for img in images + ], + session=test_session, + ).save(ds_name) - DataChain.from_values( - file=[ - ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images - ], - session=test_session, - ).save(starting_ds_name) + # first version of starting dataset + create_image_dataset(starting_ds_name, images[:2]) + # first version of incremental dataset DataChain.from_dataset( - starting_ds_name, session=test_session, + starting_ds_name, + session=test_session, ).save(ds_name, incremental=True) - new_images = [ - {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, - {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, - ] - for img in new_images: - img["data"].save(tmp_path / img["name"]) - - images += new_images - DataChain.from_values( - file=[ - ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images - ], - session=test_session, - ).save(starting_ds_name) + # second version of starting dataset + create_image_dataset(starting_ds_name, images[2:]) + # second version of incremental dataset DataChain.from_dataset( - starting_ds_name, session=test_session, + starting_ds_name, + session=test_session, ).save(ds_name, incremental=True) - dc = DataChain.from_dataset(ds_name) - - print("Images in version 1 are") - for im in DataChain.from_dataset(ds_name, version=1).collect("file"): - print(im.path) - - print("Images in version 2 are") - for im in DataChain.from_dataset(ds_name, version=2).collect("file"): - print(im.path) - - assert 1 == 2 - + assert list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) == [ + "img1.jpg", + "img2.jpg", + ] + assert list( + DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) == [ + "img1.jpg", + "img2.jpg", + "img3.jpg", + "img4.jpg", + ] From 8fa15344a1c447cb566cc0cae69ab0cd398e1571 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 21 Feb 2025 03:20:10 +0100 Subject: [PATCH 04/45] added from storage incremental update test --- tests/func/test_datachain.py | 55 ++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 1cd0d2628..d996944d2 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1847,3 +1847,58 @@ def create_image_dataset(ds_name, images): "img3.jpg", "img4.jpg", ] + + +def test_incremental_update_from_storage(test_session, tmp_dir, tmp_path): + ds_name = "incremental_ds" + images = [ + {"name": "img1.jpg", "data": 
Image.new(mode="RGB", size=(64, 64))}, + {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + ] + path = tmp_dir.as_uri() + tmp_dir = tmp_dir / "images" + os.mkdir(tmp_dir) + + # save only 2 images + for img in images[:2]: + img["data"].save(tmp_dir / img["name"]) + + # first version of incremental dataset + DataChain.from_storage( + path, + update=True, + session=test_session, + ).save(ds_name, incremental=True) + + # save other 2 images as well + for img in images[2:]: + img["data"].save(tmp_dir / img["name"]) + + # second version of incremental dataset + DataChain.from_storage( + path, + update=True, + session=test_session, + ).save(ds_name, incremental=True) + + assert list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) == [ + "images/img1.jpg", + "images/img2.jpg", + ] + + assert list( + DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) == [ + "images/img1.jpg", + "images/img2.jpg", + "images/img3.jpg", + "images/img4.jpg", + ] From 67824e697be949355bd01252ee10aa46b8be553e Mon Sep 17 00:00:00 2001 From: ilongin Date: Fri, 21 Feb 2025 14:24:04 +0100 Subject: [PATCH 05/45] refactoring --- src/datachain/lib/dc.py | 57 ++++++++----------------- src/datachain/query/dataset.py | 15 +++---- tests/func/test_datachain.py | 78 +++++++++++++++++++--------------- 3 files changed, 67 insertions(+), 83 deletions(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 8f3d9dc9e..0ff0b435b 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -25,6 +25,7 @@ from sqlalchemy.sql.sqltypes import NullType from datachain.dataset import DatasetRecord +from datachain.error import DatasetNotFoundError from datachain.func import literal from datachain.func.base import Function from datachain.func.func import Func @@ -411,7 +412,7 @@ def from_storage( object_name: str = "file", update: bool = False, anon: bool = False, - incremental: bool = False + incremental: bool = False, ) -> "Self": """Get data from a storage as a list of file with all file attributes. It returns the chain itself as usual. @@ -748,58 +749,36 @@ def save( # type: ignore[override] name : dataset name. Empty name saves to a temporary dataset that will be removed after process ends. Temp dataset are useful for optimization. version : version of a dataset. Default - the last version that exist. - incremental : wheather this is an incremental dataset or not. + incremental : whether this is an incremental dataset or not. 
""" schema = self.signals_schema.clone_without_sys_signals().serialize() if incremental and name: - """ - DataChain - .from_storage("s3://bkt/dir1/") - .filter(C("file.path").glob("*.jpg")) - .map(emb=my_embedding) - .save("incremental_ds") - - -> - DataChain - .from_storage("s3://bkt/dir1/") - .diff( - DataChain.from_dataset("incremental_ds", version=3), - on="file", # this should be get from ds feature schema + try: + latest_version = self.session.catalog.get_dataset(name).latest_version + source_ds_name = self._query.starting_step.dataset_name + source_ds_version = self._query.starting_step.dataset_version + diff = DataChain.from_dataset( + source_ds_name, version=source_ds_version + ).diff( + DataChain.from_dataset(name, version=latest_version), + on="file", # TODO this should be taken from ds feature schema added=True, modified=True, ) - .filter(C("file.path").glob("*.jpg")) - .map(emb=my_embedding) - .save("incremental_ds") - - """ - from datachain.error import DatasetNotFoundError - try: - incremental_ds = self.session.catalog.get_dataset(name) - latest_version = incremental_ds.latest_version - print(f"Starting ds is {self._query.starting_step.dataset_name}") - print(f"Starting ds version is {self._query.starting_step.dataset_version}") - diff = ( - DataChain.from_dataset( - self._query.starting_step.dataset_name, - version=self._query.starting_step.dataset_version - ) - .diff( - DataChain.from_dataset(name, version=latest_version), - on="file", # this should be get from ds feature schema - added=True, - modified=True, - ) - ) + # we append all the steps from original chain to diff dataset, + # e.g filters, mappers, mutates etc. diff._query.steps += self._query.steps + + # merging diff and latest version of our dataset chains diff = diff.union(DataChain.from_dataset(name, latest_version)) + return self._evolve( query=diff._query.save( name=name, version=version, feature_schema=schema, **kwargs ) ) except DatasetNotFoundError: - # dataset doesn't exist yet so we can continue with normal cration + # dataset doesn't exist yet so we can continue with normal flow pass return self._evolve( diff --git a/src/datachain/query/dataset.py b/src/datachain/query/dataset.py index 3b0eb420e..0093bcd0d 100644 --- a/src/datachain/query/dataset.py +++ b/src/datachain/query/dataset.py @@ -153,13 +153,6 @@ def step_result( ) -class StartingStep(ABC): - """An initial query processing step, referencing a data source.""" - - @abstractmethod - def apply(self) -> "StepResult": ... 
- - @frozen class Step(ABC): """A query processing step (filtering, mutation, etc.)""" @@ -172,12 +165,14 @@ def apply( @frozen -class QueryStep(StartingStep): +class QueryStep: + """A query that returns all rows from specific dataset version""" + catalog: "Catalog" dataset_name: str dataset_version: int - def apply(self): + def apply(self) -> "StepResult": def q(*columns): return sqlalchemy.select(*columns) @@ -1095,7 +1090,7 @@ def __init__( self.temp_table_names: list[str] = [] self.dependencies: set[DatasetDependencyType] = set() self.table = self.get_table() - self.starting_step: StartingStep + self.starting_step: QueryStep self.name: Optional[str] = None self.version: Optional[int] = None self.feature_schema: Optional[dict] = None diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index d996944d2..7c3cc0bee 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1810,23 +1810,20 @@ def create_image_dataset(ds_name, images): session=test_session, ).save(ds_name) + def create_incremental_dataset(ds_name): + DataChain.from_dataset( + starting_ds_name, + session=test_session, + ).save(ds_name, incremental=True) + # first version of starting dataset create_image_dataset(starting_ds_name, images[:2]) - # first version of incremental dataset - DataChain.from_dataset( - starting_ds_name, - session=test_session, - ).save(ds_name, incremental=True) - + create_incremental_dataset(ds_name) # second version of starting dataset create_image_dataset(starting_ds_name, images[2:]) - # second version of incremental dataset - DataChain.from_dataset( - starting_ds_name, - session=test_session, - ).save(ds_name, incremental=True) + create_incremental_dataset(ds_name) assert list( DataChain.from_dataset(ds_name, version=1) @@ -1851,45 +1848,54 @@ def create_image_dataset(ds_name, images): def test_incremental_update_from_storage(test_session, tmp_dir, tmp_path): ds_name = "incremental_ds" - images = [ - {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, - {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, - {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, - {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, - ] path = tmp_dir.as_uri() tmp_dir = tmp_dir / "images" os.mkdir(tmp_dir) - # save only 2 images - for img in images[:2]: + images = [ + { + "name": f"img{i}.{'jpg' if i % 2 == 0 else 'png'}", + "data": Image.new(mode="RGB", size=((i + 1) * 10, (i + 1) * 10)), + } + for i in range(20) + ] + + # save only half of the images for now + for img in images[:10]: img["data"].save(tmp_dir / img["name"]) + def create_incremental_dataset(): + def my_embedding(file: File) -> list[float]: + return [0.5, 0.5] + + ( + DataChain.from_storage(path, update=True, session=test_session) + .filter(C("file.path").glob("*.jpg")) + .map(emb=my_embedding) + .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) + .filter(C("file.size") % 10 < 5) + .save(ds_name, incremental=True) + ) + # first version of incremental dataset - DataChain.from_storage( - path, - update=True, - session=test_session, - ).save(ds_name, incremental=True) + create_incremental_dataset() - # save other 2 images as well - for img in images[2:]: + # save other half of images as well + for img in images[10:]: img["data"].save(tmp_dir / img["name"]) # second version of incremental dataset - DataChain.from_storage( - path, - update=True, - session=test_session, - ).save(ds_name, incremental=True) + create_incremental_dataset() assert list( 
DataChain.from_dataset(ds_name, version=1) .order_by("file.path") .collect("file.path") ) == [ - "images/img1.jpg", + "images/img0.jpg", "images/img2.jpg", + "images/img4.jpg", + "images/img8.jpg", ] assert list( @@ -1897,8 +1903,12 @@ def test_incremental_update_from_storage(test_session, tmp_dir, tmp_path): .order_by("file.path") .collect("file.path") ) == [ - "images/img1.jpg", + "images/img0.jpg", + "images/img10.jpg", + "images/img12.jpg", + "images/img16.jpg", + "images/img18.jpg", "images/img2.jpg", - "images/img3.jpg", "images/img4.jpg", + "images/img8.jpg", ] From ee6640d54eda867d54757cd0e339c58d798cd15a Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 24 Feb 2025 15:36:10 +0100 Subject: [PATCH 06/45] using delta instead of incremental --- src/datachain/lib/dc.py | 7 +++---- tests/func/test_datachain.py | 32 ++++++++++++++++---------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 599a1c708..5032ddb69 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -412,7 +412,6 @@ def from_storage( object_name: str = "file", update: bool = False, anon: bool = False, - incremental: bool = False, client_config: Optional[dict] = None, ) -> "Self": """Get data from a storage as a list of file with all file attributes. @@ -758,7 +757,7 @@ def save( # type: ignore[override] self, name: Optional[str] = None, version: Optional[int] = None, - incremental: Optional[bool] = False, + delta: Optional[bool] = False, **kwargs, ) -> "Self": """Save to a Dataset. It returns the chain itself. @@ -767,10 +766,10 @@ def save( # type: ignore[override] name : dataset name. Empty name saves to a temporary dataset that will be removed after process ends. Temp dataset are useful for optimization. version : version of a dataset. Default - the last version that exist. - incremental : whether this is an incremental dataset or not. + delta : whether this is an delta dataset or not. 
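         A usage sketch of the `delta` flag, based on the functional tests added in
         this patch series (the bucket path and the `my_embedding` function are
         illustrative, not part of this patch):

         ```py
         # The first run creates version 1 of "delta_ds"; a later run of the same
         # script only processes files that were added or modified in storage since
         # then and saves the combined result as version 2.
         (
             DataChain.from_storage("s3://bkt/images/", update=True)
             .filter(C("file.path").glob("*.jpg"))
             .map(emb=my_embedding)
             .save("delta_ds", delta=True)
         )
         ```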
""" schema = self.signals_schema.clone_without_sys_signals().serialize() - if incremental and name: + if delta and name: try: latest_version = self.session.catalog.get_dataset(name).latest_version source_ds_name = self._query.starting_step.dataset_name diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 403d85b23..18ad7eee1 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1800,9 +1800,9 @@ def func(key: str) -> str: dc.map(res=func).exec() -def test_incremental_update_from_dataset(test_session, tmp_dir, tmp_path): +def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path): starting_ds_name = "starting_ds" - ds_name = "incremental_ds" + ds_name = "delta_ds" images = [ {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, @@ -1820,20 +1820,20 @@ def create_image_dataset(ds_name, images): session=test_session, ).save(ds_name) - def create_incremental_dataset(ds_name): + def create_delta_dataset(ds_name): DataChain.from_dataset( starting_ds_name, session=test_session, - ).save(ds_name, incremental=True) + ).save(ds_name, delta=True) # first version of starting dataset create_image_dataset(starting_ds_name, images[:2]) - # first version of incremental dataset - create_incremental_dataset(ds_name) + # first version of delta dataset + create_delta_dataset(ds_name) # second version of starting dataset create_image_dataset(starting_ds_name, images[2:]) - # second version of incremental dataset - create_incremental_dataset(ds_name) + # second version of delta dataset + create_delta_dataset(ds_name) assert list( DataChain.from_dataset(ds_name, version=1) @@ -1856,8 +1856,8 @@ def create_incremental_dataset(ds_name): ] -def test_incremental_update_from_storage(test_session, tmp_dir, tmp_path): - ds_name = "incremental_ds" +def test_delta_update_from_storage(test_session, tmp_dir, tmp_path): + ds_name = "delta_ds" path = tmp_dir.as_uri() tmp_dir = tmp_dir / "images" os.mkdir(tmp_dir) @@ -1874,7 +1874,7 @@ def test_incremental_update_from_storage(test_session, tmp_dir, tmp_path): for img in images[:10]: img["data"].save(tmp_dir / img["name"]) - def create_incremental_dataset(): + def create_delta_dataset(): def my_embedding(file: File) -> list[float]: return [0.5, 0.5] @@ -1884,18 +1884,18 @@ def my_embedding(file: File) -> list[float]: .map(emb=my_embedding) .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) .filter(C("file.size") % 10 < 5) - .save(ds_name, incremental=True) + .save(ds_name, delta=True) ) - # first version of incremental dataset - create_incremental_dataset() + # first version of delta dataset + create_delta_dataset() # save other half of images as well for img in images[10:]: img["data"].save(tmp_dir / img["name"]) - # second version of incremental dataset - create_incremental_dataset() + # second version of delta dataset + create_delta_dataset() assert list( DataChain.from_dataset(ds_name, version=1) From 5e446b58022f8289780c4cd24a5c5ab135fa46de Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 00:27:31 +0100 Subject: [PATCH 07/45] added check for modification --- tests/func/test_datachain.py | 41 ++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 18ad7eee1..aa632057c 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1878,20 +1878,36 @@ def create_delta_dataset(): def my_embedding(file: File) -> list[float]: return [0.5, 0.5] + def 
get_index(file: File) -> int: + r = r".+\/img(\d+)\.jpg" + return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] + ( DataChain.from_storage(path, update=True, session=test_session) .filter(C("file.path").glob("*.jpg")) .map(emb=my_embedding) .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) - .filter(C("file.size") % 10 < 5) + .map(index=get_index) + .filter(C("index") > 3) .save(ds_name, delta=True) ) # first version of delta dataset create_delta_dataset() - # save other half of images as well - for img in images[10:]: + # remember old etags for later comparison to prove modified images are also taken + # into consideration on delta update + etags = { + r[0]: r[1].etag + for r in DataChain.from_dataset(ds_name, version=1).collect("index", "file") + } + + # remove last couple of images to simulate modification since we will re-create it + for img in images[5:10]: + os.remove(tmp_dir / img["name"]) + + # save other half of images and the ones that are removed above + for img in images[5:]: img["data"].save(tmp_dir / img["name"]) # second version of delta dataset @@ -1902,9 +1918,8 @@ def my_embedding(file: File) -> list[float]: .order_by("file.path") .collect("file.path") ) == [ - "images/img0.jpg", - "images/img2.jpg", "images/img4.jpg", + "images/img6.jpg", "images/img8.jpg", ] @@ -1913,12 +1928,24 @@ def my_embedding(file: File) -> list[float]: .order_by("file.path") .collect("file.path") ) == [ - "images/img0.jpg", "images/img10.jpg", "images/img12.jpg", + "images/img14.jpg", "images/img16.jpg", "images/img18.jpg", - "images/img2.jpg", "images/img4.jpg", + "images/img6.jpg", + "images/img6.jpg", + "images/img8.jpg", "images/img8.jpg", ] + + # check that we have both old and new version of those that are modified + rows = list( + DataChain.from_dataset(ds_name, version=2) + .filter(C("index") == 6) + .order_by("file.path", "file.etag") + .collect("file") + ) + assert rows[0].etag == etags[6] + assert rows[1].etag > etags[6] # new etag is bigger as it's the value of mtime From 71c3469f3130aeb57521746861412203c0a12ab8 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 01:25:32 +0100 Subject: [PATCH 08/45] added another test --- tests/func/test_datachain.py | 50 ++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index aa632057c..930be6ac9 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1949,3 +1949,53 @@ def get_index(file: File) -> int: ) assert rows[0].etag == etags[6] assert rows[1].etag > etags[6] # new etag is bigger as it's the value of mtime + + +def test_delta_update_no_diff(test_session, tmp_dir, tmp_path): + ds_name = "delta_ds" + path = tmp_dir.as_uri() + tmp_dir = tmp_dir / "images" + os.mkdir(tmp_dir) + + images = [ + {"name": f"img{i}.jpg", "data": Image.new(mode="RGB", size=(64, 128))} + for i in range(10) + ] + + for img in images: + img["data"].save(tmp_dir / img["name"]) + + def create_delta_dataset(): + def get_index(file: File) -> int: + r = r".+\/img(\d+)\.jpg" + return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] + + ( + DataChain.from_storage(path, update=True, session=test_session) + .filter(C("file.path").glob("*.jpg")) + .map(index=get_index) + .filter(C("index") > 5) + .save(ds_name, delta=True) + ) + + create_delta_dataset() + create_delta_dataset() + + assert ( + list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) + == list( + 
DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) + == [ + "images/img6.jpg", + "images/img7.jpg", + "images/img8.jpg", + "images/img9.jpg", + ] + ) From 83366aa2b646cf4eb752aa55ede5668757beb519 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 13:26:04 +0100 Subject: [PATCH 09/45] refactoring --- src/datachain/delta.py | 30 ++++++++++++++++++++++++++++ src/datachain/lib/dc.py | 29 ++++----------------------- src/datachain/lib/signal_schema.py | 11 +++++----- tests/func/test_datachain.py | 16 +++++++++++++++ tests/unit/lib/test_signal_schema.py | 5 +++++ 5 files changed, 60 insertions(+), 31 deletions(-) create mode 100644 src/datachain/delta.py diff --git a/src/datachain/delta.py b/src/datachain/delta.py new file mode 100644 index 000000000..809803836 --- /dev/null +++ b/src/datachain/delta.py @@ -0,0 +1,30 @@ +from typing import TYPE_CHECKING, Optional + +from datachain.error import DatasetNotFoundError + +if TYPE_CHECKING: + from datachain.lib.dc import DataChain + + +def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: + from datachain.lib.dc import DataChain + + file_signal = dc.signals_schema.get_file_signal() + if not file_signal: + raise ValueError("Datasets without file signals cannot have delta updates") + try: + latest_version = dc.session.catalog.get_dataset(name).latest_version + except DatasetNotFoundError: + return None + + source_ds_name = dc._query.starting_step.dataset_name + source_ds_version = dc._query.starting_step.dataset_version + diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( + DataChain.from_dataset(name, version=latest_version), on=file_signal + ) + # we append all the steps from original chain to diff, + # e.g filters, mappers, generators etc. + diff._query.steps += dc._query.steps + + # merging diff and latest version of our dataset chains + return diff.union(DataChain.from_dataset(name, latest_version)) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 5032ddb69..f5e0f0168 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -25,7 +25,7 @@ from sqlalchemy.sql.sqltypes import NullType from datachain.dataset import DatasetRecord -from datachain.error import DatasetNotFoundError +from datachain.delta import delta_update from datachain.func import literal from datachain.func.base import Function from datachain.func.func import Func @@ -770,34 +770,13 @@ def save( # type: ignore[override] """ schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: - try: - latest_version = self.session.catalog.get_dataset(name).latest_version - source_ds_name = self._query.starting_step.dataset_name - source_ds_version = self._query.starting_step.dataset_version - diff = DataChain.from_dataset( - source_ds_name, version=source_ds_version - ).diff( - DataChain.from_dataset(name, version=latest_version), - on="file", # TODO this should be taken from ds feature schema - added=True, - modified=True, - ) - # we append all the steps from original chain to diff dataset, - # e.g filters, mappers, mutates etc. 
- diff._query.steps += self._query.steps - - # merging diff and latest version of our dataset chains - diff = diff.union(DataChain.from_dataset(name, latest_version)) - + delta_ds = delta_update(self, name) + if delta_ds: return self._evolve( - query=diff._query.save( + query=delta_ds._query.save( name=name, version=version, feature_schema=schema, **kwargs ) ) - except DatasetNotFoundError: - # dataset doesn't exist yet so we can continue with normal flow - pass - return self._evolve( query=self._query.save( name=name, version=version, feature_schema=schema, **kwargs diff --git a/src/datachain/lib/signal_schema.py b/src/datachain/lib/signal_schema.py index d723c5b8d..17c8a8b4b 100644 --- a/src/datachain/lib/signal_schema.py +++ b/src/datachain/lib/signal_schema.py @@ -410,14 +410,13 @@ def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]: pos += 1 return objs - def contains_file(self) -> bool: - for type_ in self.values.values(): - if (fr := ModelStore.to_pydantic(type_)) is not None and issubclass( + def get_file_signal(self) -> Optional[str]: + for signal_name, signal_type in self.values.items(): + if (fr := ModelStore.to_pydantic(signal_type)) is not None and issubclass( fr, File ): - return True - - return False + return signal_name + return None def slice( self, keys: Sequence[str], setup: Optional[dict[str, Callable]] = None diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 930be6ac9..5ac3a7912 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1999,3 +1999,19 @@ def get_index(file: File) -> int: "images/img9.jpg", ] ) + + +def test_delta_update_no_file_signals(test_session): + starting_ds_name = "starting_ds" + + DataChain.from_values(num=[10, 20], session=test_session).save(starting_ds_name) + + with pytest.raises(ValueError) as excinfo: + DataChain.from_dataset( + starting_ds_name, + session=test_session, + ).save("delta_ds", delta=True) + + assert ( + str(excinfo.value) == "Datasets without file signals cannot have delta updates" + ) diff --git a/tests/unit/lib/test_signal_schema.py b/tests/unit/lib/test_signal_schema.py index cef421b5d..dc00dbf9b 100644 --- a/tests/unit/lib/test_signal_schema.py +++ b/tests/unit/lib/test_signal_schema.py @@ -992,3 +992,8 @@ def test_column_types(column_type, signal_type): assert len(signals) == 1 assert signals["val"] is signal_type + + +def test_get_file_signal(): + assert SignalSchema({"name": str, "f": File}).get_file_signal() == "f" + assert SignalSchema({"name": str}).get_file_signal() is None From a22916c1c54172c65c536d5a53f3530f3b987816 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 15:59:15 +0100 Subject: [PATCH 10/45] added comment --- src/datachain/delta.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 809803836..1b1406e3b 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -7,6 +7,13 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: + """ + Creates new chain that consists of the last version of current delta dataset + plus diff from the source with all needed modifications. + This way we don't need to re-calculate the whole chain from the source again( + apply all the DataChain methods like filters, mappers, generators etc.) + but just the diff part which is very important for performance. 
+ """ from datachain.lib.dc import DataChain file_signal = dc.signals_schema.get_file_signal() @@ -15,6 +22,7 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: try: latest_version = dc.session.catalog.get_dataset(name).latest_version except DatasetNotFoundError: + # first creation of delta update dataset return None source_ds_name = dc._query.starting_step.dataset_name @@ -22,9 +30,10 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( DataChain.from_dataset(name, version=latest_version), on=file_signal ) - # we append all the steps from original chain to diff, - # e.g filters, mappers, generators etc. + # we append all the steps from the original chain to diff, + # e.g filters, mappers, generators etc. With this we make sure we add all + # needed modifications to diff part as well diff._query.steps += dc._query.steps - # merging diff and latest version of our dataset chains + # merging diff and the latest version of our dataset return diff.union(DataChain.from_dataset(name, latest_version)) From d9e4f26a54abb46dfe633facf4f86927899454e6 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 16:06:42 +0100 Subject: [PATCH 11/45] split tests in new file --- tests/func/test_datachain.py | 217 --------------------------------- tests/func/test_delta.py | 226 +++++++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+), 217 deletions(-) create mode 100644 tests/func/test_delta.py diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 5ac3a7912..8e4599911 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -1798,220 +1798,3 @@ def func(key: str) -> str: for _ in range(4): with pytest.raises(Exception, match="Test Error!"): dc.map(res=func).exec() - - -def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path): - starting_ds_name = "starting_ds" - ds_name = "delta_ds" - - images = [ - {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, - {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, - {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, - {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, - ] - - def create_image_dataset(ds_name, images): - DataChain.from_values( - file=[ - ImageFile(path=img["name"], source=f"file://{tmp_path}") - for img in images - ], - session=test_session, - ).save(ds_name) - - def create_delta_dataset(ds_name): - DataChain.from_dataset( - starting_ds_name, - session=test_session, - ).save(ds_name, delta=True) - - # first version of starting dataset - create_image_dataset(starting_ds_name, images[:2]) - # first version of delta dataset - create_delta_dataset(ds_name) - # second version of starting dataset - create_image_dataset(starting_ds_name, images[2:]) - # second version of delta dataset - create_delta_dataset(ds_name) - - assert list( - DataChain.from_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") - ) == [ - "img1.jpg", - "img2.jpg", - ] - - assert list( - DataChain.from_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") - ) == [ - "img1.jpg", - "img2.jpg", - "img3.jpg", - "img4.jpg", - ] - - -def test_delta_update_from_storage(test_session, tmp_dir, tmp_path): - ds_name = "delta_ds" - path = tmp_dir.as_uri() - tmp_dir = tmp_dir / "images" - os.mkdir(tmp_dir) - - images = [ - { - "name": f"img{i}.{'jpg' if i % 2 == 0 else 'png'}", - "data": 
Image.new(mode="RGB", size=((i + 1) * 10, (i + 1) * 10)), - } - for i in range(20) - ] - - # save only half of the images for now - for img in images[:10]: - img["data"].save(tmp_dir / img["name"]) - - def create_delta_dataset(): - def my_embedding(file: File) -> list[float]: - return [0.5, 0.5] - - def get_index(file: File) -> int: - r = r".+\/img(\d+)\.jpg" - return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] - - ( - DataChain.from_storage(path, update=True, session=test_session) - .filter(C("file.path").glob("*.jpg")) - .map(emb=my_embedding) - .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) - .map(index=get_index) - .filter(C("index") > 3) - .save(ds_name, delta=True) - ) - - # first version of delta dataset - create_delta_dataset() - - # remember old etags for later comparison to prove modified images are also taken - # into consideration on delta update - etags = { - r[0]: r[1].etag - for r in DataChain.from_dataset(ds_name, version=1).collect("index", "file") - } - - # remove last couple of images to simulate modification since we will re-create it - for img in images[5:10]: - os.remove(tmp_dir / img["name"]) - - # save other half of images and the ones that are removed above - for img in images[5:]: - img["data"].save(tmp_dir / img["name"]) - - # second version of delta dataset - create_delta_dataset() - - assert list( - DataChain.from_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") - ) == [ - "images/img4.jpg", - "images/img6.jpg", - "images/img8.jpg", - ] - - assert list( - DataChain.from_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") - ) == [ - "images/img10.jpg", - "images/img12.jpg", - "images/img14.jpg", - "images/img16.jpg", - "images/img18.jpg", - "images/img4.jpg", - "images/img6.jpg", - "images/img6.jpg", - "images/img8.jpg", - "images/img8.jpg", - ] - - # check that we have both old and new version of those that are modified - rows = list( - DataChain.from_dataset(ds_name, version=2) - .filter(C("index") == 6) - .order_by("file.path", "file.etag") - .collect("file") - ) - assert rows[0].etag == etags[6] - assert rows[1].etag > etags[6] # new etag is bigger as it's the value of mtime - - -def test_delta_update_no_diff(test_session, tmp_dir, tmp_path): - ds_name = "delta_ds" - path = tmp_dir.as_uri() - tmp_dir = tmp_dir / "images" - os.mkdir(tmp_dir) - - images = [ - {"name": f"img{i}.jpg", "data": Image.new(mode="RGB", size=(64, 128))} - for i in range(10) - ] - - for img in images: - img["data"].save(tmp_dir / img["name"]) - - def create_delta_dataset(): - def get_index(file: File) -> int: - r = r".+\/img(\d+)\.jpg" - return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] - - ( - DataChain.from_storage(path, update=True, session=test_session) - .filter(C("file.path").glob("*.jpg")) - .map(index=get_index) - .filter(C("index") > 5) - .save(ds_name, delta=True) - ) - - create_delta_dataset() - create_delta_dataset() - - assert ( - list( - DataChain.from_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") - ) - == list( - DataChain.from_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") - ) - == [ - "images/img6.jpg", - "images/img7.jpg", - "images/img8.jpg", - "images/img9.jpg", - ] - ) - - -def test_delta_update_no_file_signals(test_session): - starting_ds_name = "starting_ds" - - DataChain.from_values(num=[10, 20], session=test_session).save(starting_ds_name) - - with pytest.raises(ValueError) as excinfo: - 
DataChain.from_dataset( - starting_ds_name, - session=test_session, - ).save("delta_ds", delta=True) - - assert ( - str(excinfo.value) == "Datasets without file signals cannot have delta updates" - ) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py new file mode 100644 index 000000000..1b5f67784 --- /dev/null +++ b/tests/func/test_delta.py @@ -0,0 +1,226 @@ +import os + +import pytest +import regex as re +from PIL import Image + +from datachain import func +from datachain.lib.dc import C, DataChain +from datachain.lib.file import File, ImageFile + + +def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path): + starting_ds_name = "starting_ds" + ds_name = "delta_ds" + + images = [ + {"name": "img1.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img2.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + {"name": "img3.jpg", "data": Image.new(mode="RGB", size=(64, 64))}, + {"name": "img4.jpg", "data": Image.new(mode="RGB", size=(128, 128))}, + ] + + def create_image_dataset(ds_name, images): + DataChain.from_values( + file=[ + ImageFile(path=img["name"], source=f"file://{tmp_path}") + for img in images + ], + session=test_session, + ).save(ds_name) + + def create_delta_dataset(ds_name): + DataChain.from_dataset( + starting_ds_name, + session=test_session, + ).save(ds_name, delta=True) + + # first version of starting dataset + create_image_dataset(starting_ds_name, images[:2]) + # first version of delta dataset + create_delta_dataset(ds_name) + # second version of starting dataset + create_image_dataset(starting_ds_name, images[2:]) + # second version of delta dataset + create_delta_dataset(ds_name) + + assert list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) == [ + "img1.jpg", + "img2.jpg", + ] + + assert list( + DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) == [ + "img1.jpg", + "img2.jpg", + "img3.jpg", + "img4.jpg", + ] + + +def test_delta_update_from_storage(test_session, tmp_dir, tmp_path): + ds_name = "delta_ds" + path = tmp_dir.as_uri() + tmp_dir = tmp_dir / "images" + os.mkdir(tmp_dir) + + images = [ + { + "name": f"img{i}.{'jpg' if i % 2 == 0 else 'png'}", + "data": Image.new(mode="RGB", size=((i + 1) * 10, (i + 1) * 10)), + } + for i in range(20) + ] + + # save only half of the images for now + for img in images[:10]: + img["data"].save(tmp_dir / img["name"]) + + def create_delta_dataset(): + def my_embedding(file: File) -> list[float]: + return [0.5, 0.5] + + def get_index(file: File) -> int: + r = r".+\/img(\d+)\.jpg" + return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] + + ( + DataChain.from_storage(path, update=True, session=test_session) + .filter(C("file.path").glob("*.jpg")) + .map(emb=my_embedding) + .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) + .map(index=get_index) + .filter(C("index") > 3) + .save(ds_name, delta=True) + ) + + # first version of delta dataset + create_delta_dataset() + + # remember old etags for later comparison to prove modified images are also taken + # into consideration on delta update + etags = { + r[0]: r[1].etag + for r in DataChain.from_dataset(ds_name, version=1).collect("index", "file") + } + + # remove last couple of images to simulate modification since we will re-create it + for img in images[5:10]: + os.remove(tmp_dir / img["name"]) + + # save other half of images and the ones that are removed above + for img in images[5:]: + img["data"].save(tmp_dir / img["name"]) 
+ + # second version of delta dataset + create_delta_dataset() + + assert list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) == [ + "images/img4.jpg", + "images/img6.jpg", + "images/img8.jpg", + ] + + assert list( + DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) == [ + "images/img10.jpg", + "images/img12.jpg", + "images/img14.jpg", + "images/img16.jpg", + "images/img18.jpg", + "images/img4.jpg", + "images/img6.jpg", + "images/img6.jpg", + "images/img8.jpg", + "images/img8.jpg", + ] + + # check that we have both old and new version of those that are modified + rows = list( + DataChain.from_dataset(ds_name, version=2) + .filter(C("index") == 6) + .order_by("file.path", "file.etag") + .collect("file") + ) + assert rows[0].etag == etags[6] + assert rows[1].etag > etags[6] # new etag is bigger as it's the value of mtime + + +def test_delta_update_no_diff(test_session, tmp_dir, tmp_path): + ds_name = "delta_ds" + path = tmp_dir.as_uri() + tmp_dir = tmp_dir / "images" + os.mkdir(tmp_dir) + + images = [ + {"name": f"img{i}.jpg", "data": Image.new(mode="RGB", size=(64, 128))} + for i in range(10) + ] + + for img in images: + img["data"].save(tmp_dir / img["name"]) + + def create_delta_dataset(): + def get_index(file: File) -> int: + r = r".+\/img(\d+)\.jpg" + return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] + + ( + DataChain.from_storage(path, update=True, session=test_session) + .filter(C("file.path").glob("*.jpg")) + .map(index=get_index) + .filter(C("index") > 5) + .save(ds_name, delta=True) + ) + + create_delta_dataset() + create_delta_dataset() + + assert ( + list( + DataChain.from_dataset(ds_name, version=1) + .order_by("file.path") + .collect("file.path") + ) + == list( + DataChain.from_dataset(ds_name, version=2) + .order_by("file.path") + .collect("file.path") + ) + == [ + "images/img6.jpg", + "images/img7.jpg", + "images/img8.jpg", + "images/img9.jpg", + ] + ) + + +def test_delta_update_no_file_signals(test_session): + starting_ds_name = "starting_ds" + + DataChain.from_values(num=[10, 20], session=test_session).save(starting_ds_name) + + with pytest.raises(ValueError) as excinfo: + DataChain.from_dataset( + starting_ds_name, + session=test_session, + ).save("delta_ds", delta=True) + + assert ( + str(excinfo.value) == "Datasets without file signals cannot have delta updates" + ) From 58c27f038fe8f1588842cafa891067945dad6dec Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 25 Feb 2025 16:39:04 +0100 Subject: [PATCH 12/45] updated docs --- src/datachain/lib/dc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index f5e0f0168..5b4065bab 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -766,7 +766,11 @@ def save( # type: ignore[override] name : dataset name. Empty name saves to a temporary dataset that will be removed after process ends. Temp dataset are useful for optimization. version : version of a dataset. Default - the last version that exist. - delta : whether this is an delta dataset or not. + delta : If True, we optimize on creation of the new dataset versions + by calculating diff between source and the last version and applying + all needed modifications (mappers, filters etc.) only on that diff. + At the end, we merge modified diff with last version of dataset to + create new version. 
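             Concretely, the behaviour encoded by the tests above: re-running the same
             chain with `delta=True` creates a new dataset version that keeps the
             previous records and adds only the ones built from new or changed source
             files (a sketch; `uri`, `session` and the dataset name are illustrative):

             ```py
             DataChain.from_storage(uri, update=True, session=session).save(
                 "delta_ds", delta=True
             )  # version 1

             # ... more files appear in storage, or existing ones are modified ...

             DataChain.from_storage(uri, update=True, session=session).save(
                 "delta_ds", delta=True
             )  # version 2

             # every path from version 1 is still present in version 2
             v1 = set(DataChain.from_dataset("delta_ds", version=1).collect("file.path"))
             v2 = set(DataChain.from_dataset("delta_ds", version=2).collect("file.path"))
             assert v1 <= v2
             ```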
""" schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: From 046731b7c354d571469e35306123015e44dc8563 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 5 Mar 2025 13:19:13 +0100 Subject: [PATCH 13/45] added sys columns explicitly --- src/datachain/diff/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datachain/diff/__init__.py b/src/datachain/diff/__init__.py index b325a2d29..0485ee6d5 100644 --- a/src/datachain/diff/__init__.py +++ b/src/datachain/diff/__init__.py @@ -137,6 +137,7 @@ def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]: for c in [c for c in cols if c in right_cols] } ) + .settings(sys=True) .select_except(ldiff_col, rdiff_col) ) From 9f52c8b132d8e358722f60a46a06583b98cc7fe0 Mon Sep 17 00:00:00 2001 From: ilongin Date: Thu, 6 Mar 2025 00:58:17 +0100 Subject: [PATCH 14/45] fixing delta to not have old versions in end result --- src/datachain/delta.py | 6 +++++- tests/func/test_delta.py | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 1b1406e3b..f3941f73e 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -36,4 +36,8 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: diff._query.steps += dc._query.steps # merging diff and the latest version of our dataset - return diff.union(DataChain.from_dataset(name, latest_version)) + return ( + DataChain.from_dataset(name, latest_version) + .diff(diff, added=True, modified=False) + .union(diff) + ) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 1b5f67784..96cd9792e 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -144,20 +144,20 @@ def get_index(file: File) -> int: "images/img18.jpg", "images/img4.jpg", "images/img6.jpg", - "images/img6.jpg", - "images/img8.jpg", "images/img8.jpg", ] - # check that we have both old and new version of those that are modified - rows = list( - DataChain.from_dataset(ds_name, version=2) - .filter(C("index") == 6) - .order_by("file.path", "file.etag") - .collect("file") + # check that we have newest versions for modified rows since etags are mtime + # and modified rows etags should be bigger than the old ones + assert ( + next( + DataChain.from_dataset(ds_name, version=2) + .filter(C("index") == 6) + .order_by("file.path", "file.etag") + .collect("file.etag") + ) + > etags[6] ) - assert rows[0].etag == etags[6] - assert rows[1].etag > etags[6] # new etag is bigger as it's the value of mtime def test_delta_update_no_diff(test_session, tmp_dir, tmp_path): From 802a934919b692027f6c6d4228166b3ef384ba26 Mon Sep 17 00:00:00 2001 From: ilongin Date: Thu, 6 Mar 2025 01:48:51 +0100 Subject: [PATCH 15/45] added append steps --- src/datachain/delta.py | 2 +- src/datachain/lib/dc.py | 9 +++++++++ src/datachain/lib/signal_schema.py | 7 +++++++ tests/unit/lib/test_datachain.py | 18 ++++++++++++++++++ tests/unit/lib/test_signal_schema.py | 6 ++++++ 5 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index f3941f73e..32ebdcbfb 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -33,7 +33,7 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # we append all the steps from the original chain to diff, # e.g filters, mappers, generators etc. 
With this we make sure we add all # needed modifications to diff part as well - diff._query.steps += dc._query.steps + diff = diff.append_steps(dc) # merging diff and the latest version of our dataset return ( diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 1aca56dca..840959a71 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -334,6 +334,15 @@ def clone(self) -> "Self": """Make a copy of the chain in a new table.""" return self._evolve(query=self._query.clone(new_table=True)) + def append_steps(self, chain: "DataChain") -> "Self": + """Returns cloned chain with appended steps from other chain. + Steps are all those modification methods applied like filters, mappers etc. + """ + dc = self.clone() + dc._query.steps += chain._query.steps + dc.signals_schema = dc.signals_schema.append(chain.signals_schema) + return dc + def _evolve( self, *, diff --git a/src/datachain/lib/signal_schema.py b/src/datachain/lib/signal_schema.py index e6654250c..c32bf10d7 100644 --- a/src/datachain/lib/signal_schema.py +++ b/src/datachain/lib/signal_schema.py @@ -646,6 +646,13 @@ def merge( return SignalSchema(self.values | schema_right) + def append(self, right: "SignalSchema") -> "SignalSchema": + missing_schema = { + key: right.values[key] + for key in [k for k in right.values if k not in self.values] + } + return SignalSchema(self.values | missing_schema) + def get_signals(self, target_type: type[DataModel]) -> Iterator[str]: for path, type_, has_subtree, _ in self.get_flat_tree(): if has_subtree and issubclass(type_, target_type): diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index d5b442edd..3a10e8616 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -2998,3 +2998,21 @@ def test_window_error(test_session): ), ): dc.mutate(first=func.sum("col2").over(window)) + + +def test_append_steps(test_session): + keys = ["a", "b", "c", "d"] + values = [1, 2, 3, 4] + + DataChain.from_values(key=keys, val=values, session=test_session).save("ds") + + ds1 = ( + DataChain.from_dataset("ds", session=test_session) + .filter(C("val") > 2) + .mutate(double=C("val") * 2) + ) + + ds2 = DataChain.from_dataset("ds", session=test_session).append_steps(ds1) + + assert list(ds2.order_by("val").collect("val")) == [3, 4] + assert list(ds2.order_by("val").collect("double")) == [6, 8] diff --git a/tests/unit/lib/test_signal_schema.py b/tests/unit/lib/test_signal_schema.py index 637194ae1..d03f52807 100644 --- a/tests/unit/lib/test_signal_schema.py +++ b/tests/unit/lib/test_signal_schema.py @@ -1137,3 +1137,9 @@ class Custom(DataModel): def test_get_file_signal(): assert SignalSchema({"name": str, "f": File}).get_file_signal() == "f" assert SignalSchema({"name": str}).get_file_signal() is None + + +def test_append(): + s1 = SignalSchema({"name": str, "f": File}) + s2 = SignalSchema({"name": str, "f": File, "age": int}) + assert s1.append(s2).values == {"name": str, "f": File, "age": int} From f3a7b128970e1bc55a0ecf7cd2cf4bcc36ee9ffd Mon Sep 17 00:00:00 2001 From: ilongin Date: Thu, 6 Mar 2025 13:14:39 +0100 Subject: [PATCH 16/45] fixing logic --- src/datachain/delta.py | 20 ++++++++++++-------- src/datachain/diff/__init__.py | 6 +++++- src/datachain/lib/dc.py | 8 +++++++- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 32ebdcbfb..b525252ca 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -27,17 +27,21 @@ def delta_update(dc: "DataChain", 
name: str) -> Optional["DataChain"]: source_ds_name = dc._query.starting_step.dataset_name source_ds_version = dc._query.starting_step.dataset_version - diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( - DataChain.from_dataset(name, version=latest_version), on=file_signal + + diff = ( + DataChain.from_dataset(source_ds_name, version=source_ds_version) + .diff( + DataChain.from_dataset(name, version=latest_version), + on=file_signal, + sys=True, + ) + # We append all the steps from the original chain to diff, e.g filters, mappers. + .append_steps(dc) ) - # we append all the steps from the original chain to diff, - # e.g filters, mappers, generators etc. With this we make sure we add all - # needed modifications to diff part as well - diff = diff.append_steps(dc) - # merging diff and the latest version of our dataset + # merging diff and the latest version of dataset return ( DataChain.from_dataset(name, latest_version) - .diff(diff, added=True, modified=False) + .diff(diff, added=True, modified=False, sys=True) .union(diff) ) diff --git a/src/datachain/diff/__init__.py b/src/datachain/diff/__init__.py index 0485ee6d5..d09931851 100644 --- a/src/datachain/diff/__init__.py +++ b/src/datachain/diff/__init__.py @@ -42,6 +42,7 @@ def _compare( # noqa: C901 modified: bool = True, same: bool = True, status_col: Optional[str] = None, + sys: Optional[bool] = False, ) -> "DataChain": """Comparing two chains by identifying rows that are added, deleted, modified or same""" @@ -137,10 +138,13 @@ def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]: for c in [c for c in cols if c in right_cols] } ) - .settings(sys=True) .select_except(ldiff_col, rdiff_col) ) + if sys: + # making sure we have sys signals in final diff chain + dc_diff = dc_diff.settings(sys=True) + if not added: dc_diff = dc_diff.filter(C(diff_col) != CompareStatus.ADDED) if not modified: diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 840959a71..571c6014c 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -339,7 +339,7 @@ def append_steps(self, chain: "DataChain") -> "Self": Steps are all those modification methods applied like filters, mappers etc. """ dc = self.clone() - dc._query.steps += chain._query.steps + dc._query.steps += chain._query.steps.copy() dc.signals_schema = dc.signals_schema.append(chain.signals_schema) return dc @@ -1648,6 +1648,7 @@ def compare( modified: bool = True, same: bool = False, status_col: Optional[str] = None, + sys: Optional[bool] = False, ) -> "DataChain": """Comparing two chains by identifying rows that are added, deleted, modified or same. Result is the new chain that has additional column with possible @@ -1680,6 +1681,7 @@ def compare( same (bool): Whether to return unchanged rows in resulting chain. status_col (str): Name of the new column that is created in resulting chain representing diff status. + sys (bool): Whether to have sys columns in returned diff chain or not. Example: ```py @@ -1710,6 +1712,7 @@ def compare( modified=modified, same=same, status_col=status_col, + sys=sys, ) def diff( @@ -1722,6 +1725,7 @@ def diff( deleted: bool = False, same: bool = False, status_col: Optional[str] = None, + sys: Optional[bool] = False, ) -> "DataChain": """Similar to `.compare()`, which is more generic method to calculate difference between two chains. Unlike `.compare()`, this method works only on those chains @@ -1744,6 +1748,7 @@ def diff( same (bool): Whether to return unchanged rows in resulting chain. 
status_col (str): Optional name of the new column that is created in resulting chain representing diff status. + sys (bool): Whether to have sys columns in returned diff chain or not. Example: ```py @@ -1783,6 +1788,7 @@ def get_file_signals(file: str, signals): modified=modified, same=same, status_col=status_col, + sys=sys, ) @classmethod From d7b86233aac1b4c51561c6c45fc3122fcb1ce36c Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 11 Mar 2025 10:48:34 +0100 Subject: [PATCH 17/45] removed append steps from DataChain --- src/datachain/delta.py | 26 +++++++++++++++++--------- src/datachain/lib/dc.py | 9 --------- tests/unit/lib/test_datachain.py | 18 ------------------ 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index b525252ca..c570aa4d1 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -6,6 +6,16 @@ from datachain.lib.dc import DataChain +def _append_steps(dc: "DataChain", other: "DataChain"): + """Returns cloned chain with appended steps from other chain. + Steps are all those modification methods applied like filters, mappers etc. + """ + dc = dc.clone() + dc._query.steps += other._query.steps.copy() + dc.signals_schema = dc.signals_schema.append(other.signals_schema) + return dc + + def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: """ Creates new chain that consists of the last version of current delta dataset @@ -28,17 +38,15 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: source_ds_name = dc._query.starting_step.dataset_name source_ds_version = dc._query.starting_step.dataset_version - diff = ( - DataChain.from_dataset(source_ds_name, version=source_ds_version) - .diff( - DataChain.from_dataset(name, version=latest_version), - on=file_signal, - sys=True, - ) - # We append all the steps from the original chain to diff, e.g filters, mappers. - .append_steps(dc) + diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( + DataChain.from_dataset(name, version=latest_version), + on=file_signal, + sys=True, ) + # We append all the steps from the original chain to diff, e.g filters, mappers. + diff = _append_steps(diff, dc) + # merging diff and the latest version of dataset return ( DataChain.from_dataset(name, latest_version) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 53a4f0625..fac14fc9e 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -334,15 +334,6 @@ def clone(self) -> "Self": """Make a copy of the chain in a new table.""" return self._evolve(query=self._query.clone(new_table=True)) - def append_steps(self, chain: "DataChain") -> "Self": - """Returns cloned chain with appended steps from other chain. - Steps are all those modification methods applied like filters, mappers etc. 
- """ - dc = self.clone() - dc._query.steps += chain._query.steps.copy() - dc.signals_schema = dc.signals_schema.append(chain.signals_schema) - return dc - def _evolve( self, *, diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index 3a10e8616..d5b442edd 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -2998,21 +2998,3 @@ def test_window_error(test_session): ), ): dc.mutate(first=func.sum("col2").over(window)) - - -def test_append_steps(test_session): - keys = ["a", "b", "c", "d"] - values = [1, 2, 3, 4] - - DataChain.from_values(key=keys, val=values, session=test_session).save("ds") - - ds1 = ( - DataChain.from_dataset("ds", session=test_session) - .filter(C("val") > 2) - .mutate(double=C("val") * 2) - ) - - ds2 = DataChain.from_dataset("ds", session=test_session).append_steps(ds1) - - assert list(ds2.order_by("val").collect("val")) == [3, 4] - assert list(ds2.order_by("val").collect("double")) == [6, 8] From 0464c165c52418028471ed08d256c1d9be6ea8af Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 11 Mar 2025 13:02:52 +0100 Subject: [PATCH 18/45] added better docs --- src/datachain/lib/dc.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index fac14fc9e..aec6a6343 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -774,10 +774,19 @@ def save( # type: ignore[override] removed after process ends. Temp dataset are useful for optimization. version : version of a dataset. Default - the last version that exist. delta : If True, we optimize on creation of the new dataset versions - by calculating diff between source and the last version and applying - all needed modifications (mappers, filters etc.) only on that diff. - At the end, we merge modified diff with last version of dataset to - create new version. + by calculating diff between source and the last version of dataset + and applying all needed modifications (mappers, filters etc.) only + on that diff. + Then we merge modified diff with the last version of dataset to + create new version. This way we avoid applying modifications to all + records from source every time since that can be expensive operation. + Source can be cloud storage or other dataset which has File object + in schema. + Diff is calculated using `DataChain.diff()` method which looks into + File `source` and `path` for matching, and File `version` and `etag` + for checking if the record is changed. + Note that this takes in account only added and changed records in + source while deleted recordsare not removed in the new dataset version. 
""" schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: From 8093000db38627e5b68390a6b0f0e62d8324204e Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 12 Mar 2025 16:26:10 +0100 Subject: [PATCH 19/45] removed sys flag --- src/datachain/delta.py | 3 +-- src/datachain/diff/__init__.py | 13 ++++++------- src/datachain/lib/dc.py | 6 ------ 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index c570aa4d1..e15f79b0b 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -41,7 +41,6 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( DataChain.from_dataset(name, version=latest_version), on=file_signal, - sys=True, ) # We append all the steps from the original chain to diff, e.g filters, mappers. @@ -50,6 +49,6 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # merging diff and the latest version of dataset return ( DataChain.from_dataset(name, latest_version) - .diff(diff, added=True, modified=False, sys=True) + .diff(diff, added=True, modified=False) .union(diff) ) diff --git a/src/datachain/diff/__init__.py b/src/datachain/diff/__init__.py index d09931851..511bc044b 100644 --- a/src/datachain/diff/__init__.py +++ b/src/datachain/diff/__init__.py @@ -30,7 +30,7 @@ class CompareStatus(str, Enum): SAME = "S" -def _compare( # noqa: C901 +def _compare( # noqa: C901, PLR0912 left: "DataChain", right: "DataChain", on: Union[str, Sequence[str]], @@ -42,7 +42,6 @@ def _compare( # noqa: C901 modified: bool = True, same: bool = True, status_col: Optional[str] = None, - sys: Optional[bool] = False, ) -> "DataChain": """Comparing two chains by identifying rows that are added, deleted, modified or same""" @@ -141,10 +140,6 @@ def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]: .select_except(ldiff_col, rdiff_col) ) - if sys: - # making sure we have sys signals in final diff chain - dc_diff = dc_diff.settings(sys=True) - if not added: dc_diff = dc_diff.filter(C(diff_col) != CompareStatus.ADDED) if not modified: @@ -157,7 +152,11 @@ def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]: if status_col: cols.append(diff_col) # type: ignore[arg-type] - dc_diff = dc_diff.select(*cols) + if not dc_diff._sys: + # TODO workaround when sys signal is not available in diff + dc_diff = dc_diff.settings(sys=True).select(*cols).settings(sys=False) + else: + dc_diff = dc_diff.select(*cols) # final schema is schema from the left chain with status column added if needed dc_diff.signals_schema = ( diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index aec6a6343..bb9a0f683 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -1648,7 +1648,6 @@ def compare( modified: bool = True, same: bool = False, status_col: Optional[str] = None, - sys: Optional[bool] = False, ) -> "DataChain": """Comparing two chains by identifying rows that are added, deleted, modified or same. Result is the new chain that has additional column with possible @@ -1681,7 +1680,6 @@ def compare( same (bool): Whether to return unchanged rows in resulting chain. status_col (str): Name of the new column that is created in resulting chain representing diff status. - sys (bool): Whether to have sys columns in returned diff chain or not. 
Example: ```py @@ -1712,7 +1710,6 @@ def compare( modified=modified, same=same, status_col=status_col, - sys=sys, ) def diff( @@ -1725,7 +1722,6 @@ def diff( deleted: bool = False, same: bool = False, status_col: Optional[str] = None, - sys: Optional[bool] = False, ) -> "DataChain": """Similar to `.compare()`, which is more generic method to calculate difference between two chains. Unlike `.compare()`, this method works only on those chains @@ -1748,7 +1744,6 @@ def diff( same (bool): Whether to return unchanged rows in resulting chain. status_col (str): Optional name of the new column that is created in resulting chain representing diff status. - sys (bool): Whether to have sys columns in returned diff chain or not. Example: ```py @@ -1788,7 +1783,6 @@ def get_file_signals(file: str, signals): modified=modified, same=same, status_col=status_col, - sys=sys, ) @classmethod From 0dd71a2d9433b7b79344ebd60cb702fd402b9356 Mon Sep 17 00:00:00 2001 From: ilongin Date: Fri, 14 Mar 2025 08:56:21 +0100 Subject: [PATCH 20/45] fixing typo --- src/datachain/lib/dc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 6e4821b7b..2c8fbd873 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -799,7 +799,7 @@ def save( # type: ignore[override] File `source` and `path` for matching, and File `version` and `etag` for checking if the record is changed. Note that this takes in account only added and changed records in - source while deleted recordsare not removed in the new dataset version. + source while deleted records are not removed in the new dataset version. """ schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: From 2b29498504fd12b958df9f6bc90df7b8fb6390df Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 24 Mar 2025 16:08:21 +0100 Subject: [PATCH 21/45] added alternative delta approach --- src/datachain/dataset.py | 17 +++++++--------- src/datachain/delta.py | 42 ++++++++++++++++++++++++++++++++++++++++ src/datachain/lib/dc.py | 4 ++-- 3 files changed, 51 insertions(+), 12 deletions(-) diff --git a/src/datachain/dataset.py b/src/datachain/dataset.py index 2fd718686..59602987a 100644 --- a/src/datachain/dataset.py +++ b/src/datachain/dataset.py @@ -105,24 +105,21 @@ def parse( dataset_version: Optional[int], dataset_version_created_at: Optional[datetime], ) -> Optional["DatasetDependency"]: - from datachain.client import Client - from datachain.lib.listing import is_listing_dataset, listing_uri_from_name + from datachain.lib.listing import is_listing_dataset if not dataset_id: return None assert dataset_name is not None - dependency_type = DatasetDependencyType.DATASET - dependency_name = dataset_name - - if is_listing_dataset(dataset_name): - dependency_type = DatasetDependencyType.STORAGE # type: ignore[arg-type] - dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name)) return cls( id, - dependency_type, - dependency_name, + ( + DatasetDependencyType.STORAGE + if is_listing_dataset(dataset_name) + else DatasetDependencyType.DATASET + ), + dataset_name, ( str(dataset_version) # type: ignore[arg-type] if dataset_version diff --git a/src/datachain/delta.py b/src/datachain/delta.py index e15f79b0b..4b39defc3 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -52,3 +52,45 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: .diff(diff, added=True, modified=False) .union(diff) ) + + +def delta_update_alternative(dc: 
"DataChain", name: str) -> Optional["DataChain"]: + from datachain.lib.dc import DataChain + + catalog = dc.session.catalog + try: + latest_version = catalog.get_dataset(name).latest_version + except DatasetNotFoundError: + # first creation of delta update dataset + return None + + dependencies = catalog.get_dataset_dependencies(name, latest_version) + if len(dependencies) > 1: + raise Exception("Cannot do delta with dataset that has multiple dependencies") + + dep = dependencies[0] + if not dep: + # starting dataset (e.g listing) was removed so we are backing off to normal + # dataset creation, as it was created first time + return None + + source_ds_name = dep.name + source_ds_version = int(dep.version) + source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version + + source_dc = DataChain.from_dataset(source_ds_name, source_ds_version) + source_dc_latest = DataChain.from_dataset(source_ds_name, source_ds_latest_version) + file_signal = source_dc.signals_schema.get_file_signal() + if not file_signal: + raise ValueError("Datasets without file signals cannot have delta updates") + + diff = source_dc_latest.diff(source_dc, on=file_signal) + # We append all the steps from the original chain to diff, e.g filters, mappers. + diff = _append_steps(diff, dc) + + # merging diff and the latest version of dataset + return ( + DataChain.from_dataset(name, latest_version) + .diff(diff, added=True, modified=False) + .union(diff) + ) diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py index 1b182b236..20a34e539 100644 --- a/src/datachain/lib/dc.py +++ b/src/datachain/lib/dc.py @@ -26,7 +26,7 @@ from tqdm import tqdm from datachain.dataset import DatasetRecord -from datachain.delta import delta_update +from datachain.delta import delta_update_alternative from datachain.func import literal from datachain.func.base import Function from datachain.func.func import Func @@ -821,7 +821,7 @@ def save( # type: ignore[override] """ schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: - delta_ds = delta_update(self, name) + delta_ds = delta_update_alternative(self, name) if delta_ds: return self._evolve( query=delta_ds._query.save( From e085280366b9da9b60a42273de3d664be31bb739 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 15 Apr 2025 03:38:39 +0200 Subject: [PATCH 22/45] fixing delta due to lazy listing changes --- src/datachain/delta.py | 18 +++++++------- src/datachain/lib/dc/datachain.py | 4 ++-- src/datachain/query/dataset.py | 25 +++++++++++++------- tests/func/test_delta.py | 39 +++++++++++++------------------ 4 files changed, 44 insertions(+), 42 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 4b39defc3..ab41b91e0 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -1,5 +1,6 @@ from typing import TYPE_CHECKING, Optional +import datachain from datachain.error import DatasetNotFoundError if TYPE_CHECKING: @@ -24,7 +25,7 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: apply all the DataChain methods like filters, mappers, generators etc.) but just the diff part which is very important for performance. 
""" - from datachain.lib.dc import DataChain + dc._query.apply_listing_pre_step() file_signal = dc.signals_schema.get_file_signal() if not file_signal: @@ -35,11 +36,12 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # first creation of delta update dataset return None + assert dc._query.starting_step source_ds_name = dc._query.starting_step.dataset_name source_ds_version = dc._query.starting_step.dataset_version - diff = DataChain.from_dataset(source_ds_name, version=source_ds_version).diff( - DataChain.from_dataset(name, version=latest_version), + diff = datachain.read_dataset(source_ds_name, version=source_ds_version).diff( + datachain.read_dataset(name, version=latest_version), on=file_signal, ) @@ -48,15 +50,13 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # merging diff and the latest version of dataset return ( - DataChain.from_dataset(name, latest_version) + datachain.read_dataset(name, latest_version) .diff(diff, added=True, modified=False) .union(diff) ) def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain"]: - from datachain.lib.dc import DataChain - catalog = dc.session.catalog try: latest_version = catalog.get_dataset(name).latest_version @@ -78,8 +78,8 @@ def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain" source_ds_version = int(dep.version) source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version - source_dc = DataChain.from_dataset(source_ds_name, source_ds_version) - source_dc_latest = DataChain.from_dataset(source_ds_name, source_ds_latest_version) + source_dc = datachain.read_dataset(source_ds_name, source_ds_version) + source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version) file_signal = source_dc.signals_schema.get_file_signal() if not file_signal: raise ValueError("Datasets without file signals cannot have delta updates") @@ -90,7 +90,7 @@ def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain" # merging diff and the latest version of dataset return ( - DataChain.from_dataset(name, latest_version) + datachain.read_dataset(name, latest_version) .diff(diff, added=True, modified=False) .union(diff) ) diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 205c53fff..a09832c8f 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -24,7 +24,7 @@ from tqdm import tqdm from datachain.dataset import DatasetRecord -from datachain.delta import delta_update_alternative +from datachain.delta import delta_update from datachain.func import literal from datachain.func.base import Function from datachain.func.func import Func @@ -488,7 +488,7 @@ def save( # type: ignore[override] """ schema = self.signals_schema.clone_without_sys_signals().serialize() if delta and name: - delta_ds = delta_update_alternative(self, name) + delta_ds = delta_update(self, name) if delta_ds: return self._evolve( query=delta_ds._query.save( diff --git a/src/datachain/query/dataset.py b/src/datachain/query/dataset.py index 49297e769..ea881f13e 100644 --- a/src/datachain/query/dataset.py +++ b/src/datachain/query/dataset.py @@ -1113,9 +1113,14 @@ def __init__( self.version = version if is_listing_dataset(name): - # not setting query step yet as listing dataset might not exist at - # this point - self.list_ds_name = name + if version: + # this listing dataset should already be listed as we specify + # exact version + 
self._set_starting_step(self.catalog.get_dataset(name)) + else: + # not setting query step yet as listing dataset might not exist at + # this point + self.list_ds_name = name elif fallback_to_studio and is_token_set(): self._set_starting_step( self.catalog.get_dataset_with_remote_fallback(name, version) @@ -1201,11 +1206,8 @@ def set_listing_fn(self, fn: Callable) -> None: """Setting listing function to be run if needed""" self.listing_fn = fn - def apply_steps(self) -> QueryGenerator: - """ - Apply the steps in the query and return the resulting - sqlalchemy.SelectBase. - """ + def apply_listing_pre_step(self) -> None: + """Runs listing pre-step if needed""" if self.list_ds_name and not self.starting_step: listing_ds = None try: @@ -1221,6 +1223,13 @@ def apply_steps(self) -> QueryGenerator: # at this point we know what is our starting listing dataset name self._set_starting_step(listing_ds) # type: ignore [arg-type] + def apply_steps(self) -> QueryGenerator: + """ + Apply the steps in the query and return the resulting + sqlalchemy.SelectBase. + """ + self.apply_listing_pre_step() + query = self.clone() index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 96cd9792e..5c4988f3a 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -4,8 +4,9 @@ import regex as re from PIL import Image +import datachain as dc from datachain import func -from datachain.lib.dc import C, DataChain +from datachain.lib.dc import C from datachain.lib.file import File, ImageFile @@ -21,7 +22,7 @@ def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path): ] def create_image_dataset(ds_name, images): - DataChain.from_values( + dc.read_values( file=[ ImageFile(path=img["name"], source=f"file://{tmp_path}") for img in images @@ -30,7 +31,7 @@ def create_image_dataset(ds_name, images): ).save(ds_name) def create_delta_dataset(ds_name): - DataChain.from_dataset( + dc.read_dataset( starting_ds_name, session=test_session, ).save(ds_name, delta=True) @@ -45,18 +46,14 @@ def create_delta_dataset(ds_name): create_delta_dataset(ds_name) assert list( - DataChain.from_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") + dc.read_dataset(ds_name, version=1).order_by("file.path").collect("file.path") ) == [ "img1.jpg", "img2.jpg", ] assert list( - DataChain.from_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") + dc.read_dataset(ds_name, version=2).order_by("file.path").collect("file.path") ) == [ "img1.jpg", "img2.jpg", @@ -92,7 +89,7 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - DataChain.from_storage(path, update=True, session=test_session) + dc.read_storage(path, update=True, session=test_session) .filter(C("file.path").glob("*.jpg")) .map(emb=my_embedding) .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) @@ -108,7 +105,7 @@ def get_index(file: File) -> int: # into consideration on delta update etags = { r[0]: r[1].etag - for r in DataChain.from_dataset(ds_name, version=1).collect("index", "file") + for r in dc.read_dataset(ds_name, version=1).collect("index", "file") } # remove last couple of images to simulate modification since we will re-create it @@ -123,9 +120,7 @@ def get_index(file: File) -> int: create_delta_dataset() assert list( - DataChain.from_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") + dc.read_dataset(ds_name, 
version=1).order_by("file.path").collect("file.path") ) == [ "images/img4.jpg", "images/img6.jpg", @@ -133,9 +128,7 @@ def get_index(file: File) -> int: ] assert list( - DataChain.from_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") + dc.read_dataset(ds_name, version=2).order_by("file.path").collect("file.path") ) == [ "images/img10.jpg", "images/img12.jpg", @@ -151,7 +144,7 @@ def get_index(file: File) -> int: # and modified rows etags should be bigger than the old ones assert ( next( - DataChain.from_dataset(ds_name, version=2) + dc.read_dataset(ds_name, version=2) .filter(C("index") == 6) .order_by("file.path", "file.etag") .collect("file.etag") @@ -180,7 +173,7 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - DataChain.from_storage(path, update=True, session=test_session) + dc.read_storage(path, update=True, session=test_session) .filter(C("file.path").glob("*.jpg")) .map(index=get_index) .filter(C("index") > 5) @@ -192,12 +185,12 @@ def get_index(file: File) -> int: assert ( list( - DataChain.from_dataset(ds_name, version=1) + dc.read_dataset(ds_name, version=1) .order_by("file.path") .collect("file.path") ) == list( - DataChain.from_dataset(ds_name, version=2) + dc.read_dataset(ds_name, version=2) .order_by("file.path") .collect("file.path") ) @@ -213,10 +206,10 @@ def get_index(file: File) -> int: def test_delta_update_no_file_signals(test_session): starting_ds_name = "starting_ds" - DataChain.from_values(num=[10, 20], session=test_session).save(starting_ds_name) + dc.read_values(num=[10, 20], session=test_session).save(starting_ds_name) with pytest.raises(ValueError) as excinfo: - DataChain.from_dataset( + dc.read_dataset( starting_ds_name, session=test_session, ).save("delta_ds", delta=True) From 735af026a7598fd097d7427fd92ef1424af8b6ce Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 15 Apr 2025 10:41:20 +0200 Subject: [PATCH 23/45] fixing datasetdependencies --- src/datachain/dataset.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/datachain/dataset.py b/src/datachain/dataset.py index 59602987a..2fd718686 100644 --- a/src/datachain/dataset.py +++ b/src/datachain/dataset.py @@ -105,21 +105,24 @@ def parse( dataset_version: Optional[int], dataset_version_created_at: Optional[datetime], ) -> Optional["DatasetDependency"]: - from datachain.lib.listing import is_listing_dataset + from datachain.client import Client + from datachain.lib.listing import is_listing_dataset, listing_uri_from_name if not dataset_id: return None assert dataset_name is not None + dependency_type = DatasetDependencyType.DATASET + dependency_name = dataset_name + + if is_listing_dataset(dataset_name): + dependency_type = DatasetDependencyType.STORAGE # type: ignore[arg-type] + dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name)) return cls( id, - ( - DatasetDependencyType.STORAGE - if is_listing_dataset(dataset_name) - else DatasetDependencyType.DATASET - ), - dataset_name, + dependency_type, + dependency_name, ( str(dataset_version) # type: ignore[arg-type] if dataset_version From f3ebf97397f4c9288b0799cd3c69008d99cfef26 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 15 Apr 2025 11:47:55 +0200 Subject: [PATCH 24/45] returning function --- src/datachain/lib/signal_schema.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/datachain/lib/signal_schema.py b/src/datachain/lib/signal_schema.py index b011949cf..fd9a17e1a 
100644 --- a/src/datachain/lib/signal_schema.py +++ b/src/datachain/lib/signal_schema.py @@ -461,13 +461,14 @@ def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]: pos += 1 return objs - def get_file_signal(self) -> Optional[str]: - for signal_name, signal_type in self.values.items(): - if (fr := ModelStore.to_pydantic(signal_type)) is not None and issubclass( + def contains_file(self) -> bool: + for type_ in self.values.values(): + if (fr := ModelStore.to_pydantic(type_)) is not None and issubclass( fr, File ): - return signal_name - return None + return True + + return False def slice( self, From 59b7666b0119042554104e3767b230ceb6f1c4e7 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 15 Apr 2025 16:09:06 +0200 Subject: [PATCH 25/45] renaming method --- src/datachain/lib/signal_schema.py | 11 +++++------ tests/unit/lib/test_signal_schema.py | 11 ----------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/src/datachain/lib/signal_schema.py b/src/datachain/lib/signal_schema.py index fd9a17e1a..b011949cf 100644 --- a/src/datachain/lib/signal_schema.py +++ b/src/datachain/lib/signal_schema.py @@ -461,14 +461,13 @@ def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]: pos += 1 return objs - def contains_file(self) -> bool: - for type_ in self.values.values(): - if (fr := ModelStore.to_pydantic(type_)) is not None and issubclass( + def get_file_signal(self) -> Optional[str]: + for signal_name, signal_type in self.values.items(): + if (fr := ModelStore.to_pydantic(signal_type)) is not None and issubclass( fr, File ): - return True - - return False + return signal_name + return None def slice( self, diff --git a/tests/unit/lib/test_signal_schema.py b/tests/unit/lib/test_signal_schema.py index 26ddc7b5d..2eb7fb769 100644 --- a/tests/unit/lib/test_signal_schema.py +++ b/tests/unit/lib/test_signal_schema.py @@ -1041,17 +1041,6 @@ def test_get_flatten_hidden_fields(schema, hidden_fields): assert SignalSchema.get_flatten_hidden_fields(schema_serialized) == hidden_fields -@pytest.mark.parametrize( - "schema,result", - [ - ({"name": str, "value": int}, False), - ({"name": str, "age": float, "f": File}, True), - ], -) -def test_contains_file(schema, result): - assert SignalSchema(schema).contains_file() is result - - def test_slice(): schema = {"name": str, "age": float, "address": str} setup_values = {"init": lambda: 37} From 7d0a28308b879bd580859051307dc59fab95cd63 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 22 Apr 2025 16:21:42 +0200 Subject: [PATCH 26/45] leaving only alternative implementation --- src/datachain/dataset.py | 17 +++++------- src/datachain/delta.py | 56 ++++++++++++++-------------------------- tests/func/test_delta.py | 3 ++- 3 files changed, 29 insertions(+), 47 deletions(-) diff --git a/src/datachain/dataset.py b/src/datachain/dataset.py index 38d53fdcc..8934d34a6 100644 --- a/src/datachain/dataset.py +++ b/src/datachain/dataset.py @@ -105,24 +105,21 @@ def parse( dataset_version: Optional[int], dataset_version_created_at: Optional[datetime], ) -> Optional["DatasetDependency"]: - from datachain.client import Client - from datachain.lib.listing import is_listing_dataset, listing_uri_from_name + from datachain.lib.listing import is_listing_dataset if not dataset_id: return None assert dataset_name is not None - dependency_type = DatasetDependencyType.DATASET - dependency_name = dataset_name - - if is_listing_dataset(dataset_name): - dependency_type = DatasetDependencyType.STORAGE # type: ignore[arg-type] - dependency_name, _ = 
Client.parse_url(listing_uri_from_name(dataset_name)) return cls( id, - dependency_type, - dependency_name, + ( + DatasetDependencyType.STORAGE + if is_listing_dataset(dataset_name) + else DatasetDependencyType.DATASET + ), + dataset_name, ( str(dataset_version) # type: ignore[arg-type] if dataset_version diff --git a/src/datachain/delta.py b/src/datachain/delta.py index ab41b91e0..f375e4501 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -24,40 +24,16 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: This way we don't need to re-calculate the whole chain from the source again( apply all the DataChain methods like filters, mappers, generators etc.) but just the diff part which is very important for performance. + + Note that currently delta update works only if there is only one direct dependency. """ + catalog = dc.session.catalog dc._query.apply_listing_pre_step() - file_signal = dc.signals_schema.get_file_signal() - if not file_signal: - raise ValueError("Datasets without file signals cannot have delta updates") - try: - latest_version = dc.session.catalog.get_dataset(name).latest_version - except DatasetNotFoundError: - # first creation of delta update dataset - return None - - assert dc._query.starting_step - source_ds_name = dc._query.starting_step.dataset_name - source_ds_version = dc._query.starting_step.dataset_version - - diff = datachain.read_dataset(source_ds_name, version=source_ds_version).diff( - datachain.read_dataset(name, version=latest_version), - on=file_signal, - ) - - # We append all the steps from the original chain to diff, e.g filters, mappers. - diff = _append_steps(diff, dc) - - # merging diff and the latest version of dataset - return ( - datachain.read_dataset(name, latest_version) - .diff(diff, added=True, modified=False) - .union(diff) - ) + chain_file_signal = dc.signals_schema.get_file_signal() + if not chain_file_signal: + raise ValueError("Chain doesn't produce file signal, cannot do delta update") - -def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain"]: - catalog = dc.session.catalog try: latest_version = catalog.get_dataset(name).latest_version except DatasetNotFoundError: @@ -66,7 +42,9 @@ def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain" dependencies = catalog.get_dataset_dependencies(name, latest_version) if len(dependencies) > 1: - raise Exception("Cannot do delta with dataset that has multiple dependencies") + raise Exception( + "Cannot do delta with dataset that has multiple direct dependencies" + ) dep = dependencies[0] if not dep: @@ -80,17 +58,23 @@ def delta_update_alternative(dc: "DataChain", name: str) -> Optional["DataChain" source_dc = datachain.read_dataset(source_ds_name, source_ds_version) source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version) - file_signal = source_dc.signals_schema.get_file_signal() - if not file_signal: - raise ValueError("Datasets without file signals cannot have delta updates") + source_file_signal = source_dc.signals_schema.get_file_signal() + if not source_file_signal: + raise ValueError("Source dataset doesn't have file signals") - diff = source_dc_latest.diff(source_dc, on=file_signal) + diff = source_dc_latest.diff(source_dc, on=source_file_signal) # We append all the steps from the original chain to diff, e.g filters, mappers. 
diff = _append_steps(diff, dc) # merging diff and the latest version of dataset return ( datachain.read_dataset(name, latest_version) - .diff(diff, added=True, modified=False) + .diff( + diff, + on=chain_file_signal, + right_on=source_file_signal, + added=True, + modified=False, + ) .union(diff) ) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 5c4988f3a..74d7f78d0 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -215,5 +215,6 @@ def test_delta_update_no_file_signals(test_session): ).save("delta_ds", delta=True) assert ( - str(excinfo.value) == "Datasets without file signals cannot have delta updates" + str(excinfo.value) + == "Chain doesn't produce file signal, cannot do delta update" ) From 95a206bf787dd9ac282dee8e3b923034138b326a Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 23 Apr 2025 01:52:11 +0200 Subject: [PATCH 27/45] fixing tests --- tests/func/test_dataset_query.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/func/test_dataset_query.py b/tests/func/test_dataset_query.py index 093b7d797..7af2ddc26 100644 --- a/tests/func/test_dataset_query.py +++ b/tests/func/test_dataset_query.py @@ -10,6 +10,7 @@ from datachain.error import ( DatasetVersionNotFoundError, ) +from datachain.lib.listing import parse_listing_uri from datachain.query import C, DatasetQuery, Object, Stream from datachain.sql.functions import path as pathfunc from datachain.sql.types import String @@ -956,6 +957,9 @@ def test_dataset_dependencies_one_storage_as_dependency( ds_name = uuid.uuid4().hex catalog = cloud_test_catalog.catalog listing = catalog.listings()[0] + dep_name, _, _ = parse_listing_uri( + cloud_test_catalog.src_uri, catalog.client_config + ) DatasetQuery(cats_dataset.name, catalog=catalog).save(ds_name) @@ -968,7 +972,7 @@ def test_dataset_dependencies_one_storage_as_dependency( { "id": ANY, "type": DatasetDependencyType.STORAGE, - "name": cloud_test_catalog.src_uri, + "name": dep_name, "version": str(1), "created_at": listing.created_at, "dependencies": [], @@ -984,6 +988,10 @@ def test_dataset_dependencies_one_registered_dataset_as_dependency( catalog = cloud_test_catalog.catalog listing = catalog.listings()[0] + dep_name, _, _ = parse_listing_uri( + cloud_test_catalog.src_uri, catalog.client_config + ) + DatasetQuery(name=dogs_dataset.name, catalog=catalog).save(ds_name) expected = [ @@ -1002,7 +1010,7 @@ def test_dataset_dependencies_one_registered_dataset_as_dependency( { "id": ANY, "type": DatasetDependencyType.STORAGE, - "name": cloud_test_catalog.src_uri, + "name": dep_name, "version": str(1), "created_at": listing.created_at, "dependencies": [], @@ -1028,6 +1036,9 @@ def test_dataset_dependencies_multiple_direct_dataset_dependencies( ds_name = uuid.uuid4().hex catalog = cloud_test_catalog.catalog listing = catalog.listings()[0] + dep_name, _, _ = parse_listing_uri( + cloud_test_catalog.src_uri, catalog.client_config + ) dogs = DatasetQuery(name=dogs_dataset.name, version=1, catalog=catalog) cats = DatasetQuery(name=cats_dataset.name, version=1, catalog=catalog) @@ -1040,7 +1051,7 @@ def test_dataset_dependencies_multiple_direct_dataset_dependencies( storage_depenedncy = { "id": ANY, "type": DatasetDependencyType.STORAGE, - "name": cloud_test_catalog.src_uri, + "name": dep_name, "version": str(1), "created_at": listing.created_at, "dependencies": [], @@ -1097,6 +1108,9 @@ def test_dataset_dependencies_multiple_union( ds_name = uuid.uuid4().hex catalog = cloud_test_catalog.catalog listing = 
catalog.listings()[0] + dep_name, _, _ = parse_listing_uri( + cloud_test_catalog.src_uri, catalog.client_config + ) dogs = DatasetQuery(name=dogs_dataset.name, version=1, catalog=catalog) cats = DatasetQuery(name=cats_dataset.name, version=1, catalog=catalog) @@ -1107,7 +1121,7 @@ def test_dataset_dependencies_multiple_union( storage_depenedncy = { "id": ANY, "type": DatasetDependencyType.STORAGE, - "name": cloud_test_catalog.src_uri, + "name": dep_name, "version": str(1), "created_at": listing.created_at, "dependencies": [], From de7232983b8c8e782588bf700e7dc8c8e6087fe5 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 23 Apr 2025 02:02:35 +0200 Subject: [PATCH 28/45] fixing tests --- tests/func/test_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/func/test_datasets.py b/tests/func/test_datasets.py index f47eb8d2b..d55d09423 100644 --- a/tests/func/test_datasets.py +++ b/tests/func/test_datasets.py @@ -854,6 +854,7 @@ def test_dataset_storage_dependencies(cloud_test_catalog, cloud_type, indirect): session = ctc.session catalog = session.catalog uri = cloud_test_catalog.src_uri + dep_name, _, _ = parse_listing_uri(ctc.src_uri, catalog.client_config) ds_name = "some_ds" dc.read_storage(uri, session=session).save(ds_name) @@ -868,7 +869,7 @@ def test_dataset_storage_dependencies(cloud_test_catalog, cloud_type, indirect): { "id": ANY, "type": DatasetDependencyType.STORAGE, - "name": uri, + "name": dep_name, "version": "1", "created_at": lst_dataset.get_version(1).created_at, "dependencies": [], From 55269ab250b099050fa159aa84dfccf7a0a54d43 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 23 Apr 2025 10:55:45 +0200 Subject: [PATCH 29/45] fixing tests --- tests/func/test_datachain.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/func/test_datachain.py b/tests/func/test_datachain.py index 56094017d..030d125af 100644 --- a/tests/func/test_datachain.py +++ b/tests/func/test_datachain.py @@ -231,15 +231,13 @@ def test_read_storage_dependencies(cloud_test_catalog, cloud_type): ctc = cloud_test_catalog src_uri = ctc.src_uri uri = f"{src_uri}/cats" + dep_name, _, _ = parse_listing_uri(uri, ctc.catalog.client_config) ds_name = "dep" dc.read_storage(uri, session=ctc.session).save(ds_name) dependencies = ctc.session.catalog.get_dataset_dependencies(ds_name, 1) assert len(dependencies) == 1 assert dependencies[0].type == DatasetDependencyType.STORAGE - if cloud_type == "file": - assert dependencies[0].name == uri - else: - assert dependencies[0].name == src_uri + assert dependencies[0].name == dep_name @pytest.mark.parametrize("use_cache", [True, False]) From b7b16bad3e89ad0f2f40a4a69e63376bbc7edf07 Mon Sep 17 00:00:00 2001 From: ilongin Date: Fri, 25 Apr 2025 13:44:09 +0200 Subject: [PATCH 30/45] updating docs --- src/datachain/delta.py | 4 +- src/datachain/lib/dc/datachain.py | 45 ++++++++++++++-- src/datachain/lib/dc/datasets.py | 18 ++++++- src/datachain/lib/dc/storage.py | 20 ++++++- tests/func/test_delta.py | 88 +++++++++++++++++++++++++++---- 5 files changed, 159 insertions(+), 16 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index f375e4501..1fd116a35 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -40,7 +40,9 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # first creation of delta update dataset return None - dependencies = catalog.get_dataset_dependencies(name, latest_version) + dependencies = catalog.get_dataset_dependencies( + name, latest_version, 
indirect=False + ) if len(dependencies) > 1: raise Exception( "Cannot do delta with dataset that has multiple direct dependencies" diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 1ac897993..d0490dd33 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -4,6 +4,7 @@ import sys import warnings from collections.abc import Iterator, Sequence +from functools import wraps from typing import ( IO, TYPE_CHECKING, @@ -67,11 +68,34 @@ if TYPE_CHECKING: import pandas as pd - from typing_extensions import ParamSpec, Self + from typing_extensions import Concatenate, ParamSpec, Self P = ParamSpec("P") +T = TypeVar("T", bound="DataChain") + + +def delta_disabled( + method: "Callable[Concatenate[T, P], T]", +) -> "Callable[Concatenate[T, P], T]": + """ + Decorator for disabling DataChain methods (e.g `.agg()` or `.union()`) to + work with delta updates. It throws `NotImplementedError` if chain on which + method is called is marked as delta. + """ + + @wraps(method) + def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T: + if self.delta: + raise NotImplementedError( + f"Delta update cannot be used with {method.__name__}" + ) + return method(self, *args, **kwargs) + + return _inner + + class DataChain: """DataChain - a data structure for batch data processing and evaluation. @@ -164,6 +188,7 @@ def __init__( self.signals_schema = signal_schema self._setup: dict = setup or {} self._sys = _sys + self._delta = False def __repr__(self) -> str: """Return a string representation of the chain.""" @@ -177,6 +202,16 @@ def __repr__(self) -> str: self.print_schema(file=file) return file.getvalue() + def as_delta(self, delta: bool = False) -> "Self": + """Marks this chain as delta, which means special delta process will be + called on saving dataset for optimization""" + self._delta = delta + return self + + @property + def delta(self) -> bool: + return self._delta + @property def schema(self) -> dict[str, DataType]: """Get schema of the chain.""" @@ -461,7 +496,6 @@ def save( # type: ignore[override] version: Optional[int] = None, description: Optional[str] = None, attrs: Optional[list[str]] = None, - delta: Optional[bool] = False, **kwargs, ) -> "Self": """Save to a Dataset. It returns the chain itself. @@ -488,7 +522,7 @@ def save( # type: ignore[override] source while deleted records are not removed in the new dataset version. 
""" schema = self.signals_schema.clone_without_sys_signals().serialize() - if delta and name: + if self.delta and name: delta_ds = delta_update(self, name) if delta_ds: return self._evolve( @@ -620,6 +654,7 @@ def gen( signal_schema=udf_obj.output, ) + @delta_disabled def agg( self, func: Optional[Callable] = None, @@ -773,6 +808,7 @@ def order_by(self, *args, descending: bool = False) -> "Self": return self._evolve(query=self._query.order_by(*args)) + @delta_disabled def distinct(self, arg: str, *args: str) -> "Self": # type: ignore[override] """Removes duplicate rows based on uniqueness of some input column(s) i.e if rows are found with the same value of input column(s), only one @@ -807,6 +843,7 @@ def select_except(self, *args: str) -> "Self": query=self._query.select(*columns), signal_schema=new_schema ) + @delta_disabled # type: ignore[arg-type] def group_by( self, *, @@ -1165,6 +1202,7 @@ def remove_file_signals(self) -> "Self": schema = self.signals_schema.clone_without_file_signals() return self.select(*schema.values.keys()) + @delta_disabled def merge( self, right_ds: "DataChain", @@ -1273,6 +1311,7 @@ def _resolve( return ds + @delta_disabled def union(self, other: "Self") -> "Self": """Return the set union of the two datasets. diff --git a/src/datachain/lib/dc/datasets.py b/src/datachain/lib/dc/datasets.py index 1765a92d7..12b228b95 100644 --- a/src/datachain/lib/dc/datasets.py +++ b/src/datachain/lib/dc/datasets.py @@ -26,6 +26,7 @@ def read_dataset( session: Optional[Session] = None, settings: Optional[dict] = None, fallback_to_studio: bool = True, + delta: bool = False, ) -> "DataChain": """Get data from a saved Dataset. It returns the chain itself. If dataset or version is not found locally, it will try to pull it from Studio. @@ -37,6 +38,21 @@ def read_dataset( settings : Settings to use for the chain. fallback_to_studio : Try to pull dataset from Studio if not found locally. Default is True. + delta : If True, we optimize on creation of the new dataset versions + by calculating diff between last version of this dataset and the version + with which last version of resulting chain dataset (the one specified in + `.save()`) was created. + We then run the "diff" chain with this diff data returned instead of + all dataset data, and we union that diff chain with last version of + resulting dataset creating new version of it. + This way we avoid applying modifications to all records from dataset + every time since that can be expensive operation. + Dataset needs to have File object in schema. + Diff is calculated using `DataChain.diff()` method which looks into + File `source` and `path` for matching, and File `version` and `etag` + for checking if the record is changed. + Note that this takes in account only added and changed records in + dataset while deleted records are not removed in the new dataset version. 
Example: ```py @@ -92,7 +108,7 @@ def read_dataset( signals_schema |= SignalSchema.deserialize(query.feature_schema) else: signals_schema |= SignalSchema.from_column_types(query.column_types or {}) - return DataChain(query, _settings, signals_schema) + return DataChain(query, _settings, signals_schema).as_delta(delta) def datasets( diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 551ef160a..9814d1f89 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -32,6 +32,7 @@ def read_storage( column: str = "file", update: bool = False, anon: bool = False, + delta: bool = False, client_config: Optional[dict] = None, ) -> "DataChain": """Get data from storage(s) as a list of file with all file attributes. @@ -47,6 +48,21 @@ def read_storage( update : force storage reindexing. Default is False. anon : If True, we will treat cloud bucket as public one client_config : Optional client configuration for the storage client. + delta : If True, we optimize on creation of the new dataset versions + by calculating diff between last version of this storage and the version + with which last version of resulting chain dataset (the one specified in + `.save()`) was created. + We then run the "diff" chain with this diff data returned instead of + all storage data, and we union that diff chain with last version of + resulting dataset creating new version of it. + This way we avoid applying modifications to all records from storage + every time since that can be expensive operation. + Dataset needs to have File object in schema. + Diff is calculated using `DataChain.diff()` method which looks into + File `source` and `path` for matching, and File `version` and `etag` + for checking if the record is changed. + Note that this takes in account only added and changed records in + storage while deleted records are not removed in the new dataset version. Returns: DataChain: A DataChain object containing the file information. 
@@ -122,7 +138,7 @@ def read_storage( ) continue - dc = read_dataset(list_ds_name, session=session, settings=settings) + dc = read_dataset(list_ds_name, session=session, settings=settings, delta=delta) dc._query.update = update dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type}) @@ -151,7 +167,7 @@ def lst_fn(ds_name, lst_uri): chain = ls(dc, list_path, recursive=recursive, column=column) - storage_chain = storage_chain.union(chain) if storage_chain else chain + storage_chain = storage_chain.union(chain) if storage_chain else chain # type: ignore[attr-defined] listed_ds_name.add(list_ds_name) if file_values: diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 74d7f78d0..e14944694 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -31,10 +31,9 @@ def create_image_dataset(ds_name, images): ).save(ds_name) def create_delta_dataset(ds_name): - dc.read_dataset( - starting_ds_name, - session=test_session, - ).save(ds_name, delta=True) + dc.read_dataset(starting_ds_name, session=test_session, delta=True).save( + ds_name + ) # first version of starting dataset create_image_dataset(starting_ds_name, images[:2]) @@ -89,13 +88,13 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - dc.read_storage(path, update=True, session=test_session) + dc.read_storage(path, update=True, session=test_session, delta=True) .filter(C("file.path").glob("*.jpg")) .map(emb=my_embedding) .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) .map(index=get_index) .filter(C("index") > 3) - .save(ds_name, delta=True) + .save(ds_name) ) # first version of delta dataset @@ -173,11 +172,11 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - dc.read_storage(path, update=True, session=test_session) + dc.read_storage(path, update=True, session=test_session, delta=True) .filter(C("file.path").glob("*.jpg")) .map(index=get_index) .filter(C("index") > 5) - .save(ds_name, delta=True) + .save(ds_name) ) create_delta_dataset() @@ -212,9 +211,80 @@ def test_delta_update_no_file_signals(test_session): dc.read_dataset( starting_ds_name, session=test_session, - ).save("delta_ds", delta=True) + delta=True, + ).save("delta_ds") assert ( str(excinfo.value) == "Chain doesn't produce file signal, cannot do delta update" ) + + +@pytest.fixture +def file_dataset(test_session): + return dc.read_values( + file=[ + File(path="a.jpg", source="s3://bucket"), + File(path="b.jpg", source="s3://bucket"), + ], + session=test_session, + ).save("file_ds") + + +def test_delta_update_union(test_session, file_dataset): + dc.read_values(num=[10, 20], session=test_session).save("numbers") + + with pytest.raises(NotImplementedError) as excinfo: + ( + dc.read_dataset(file_dataset.name, session=test_session, delta=True).union( + dc.read_dataset("numbers"), session=test_session + ) + ) + + assert str(excinfo.value) == "Delta update cannot be used with union" + + +def test_delta_update_merge(test_session, file_dataset): + dc.read_values(num=[10, 20], session=test_session).save("numbers") + + with pytest.raises(NotImplementedError) as excinfo: + ( + dc.read_dataset(file_dataset.name, session=test_session, delta=True).merge( + dc.read_dataset("numbers"), on="id", session=test_session + ) + ) + + assert str(excinfo.value) == "Delta update cannot be used with merge" + + +def test_delta_update_distinct(test_session, file_dataset): + with pytest.raises(NotImplementedError) as excinfo: + ( + 
dc.read_dataset( + file_dataset.name, session=test_session, delta=True + ).distinct("file.path") + ) + + assert str(excinfo.value) == "Delta update cannot be used with distinct" + + +def test_delta_update_group_by(test_session, file_dataset): + with pytest.raises(NotImplementedError) as excinfo: + ( + dc.read_dataset( + file_dataset.name, session=test_session, delta=True + ).group_by(cnt=func.count(), partition_by="file.path") + ) + + assert str(excinfo.value) == "Delta update cannot be used with group_by" + + +def test_delta_update_agg(test_session, file_dataset): + with pytest.raises(NotImplementedError) as excinfo: + ( + dc.read_dataset(file_dataset.name, session=test_session, delta=True).agg( + cnt=func.count(), partition_by="file.path" + ) + ) + + assert str(excinfo.value) == "Delta update cannot be used with agg" From 723a1a65f9130b93bf13da441d64d738392cb5c7 Mon Sep 17 00:00:00 2001 From: ilongin Date: Fri, 25 Apr 2025 16:23:50 +0200 Subject: [PATCH 31/45] not creating dataset if diff is empty --- src/datachain/delta.py | 11 +++++++---- src/datachain/lib/dc/datachain.py | 21 +++++++++++++++++--- src/datachain/lib/dc/storage.py | 4 ++-- tests/func/test_delta.py | 32 ++++++++++++++----------------- tests/unit/lib/test_datachain.py | 4 ++++ 5 files changed, 45 insertions(+), 27 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 1fd116a35..a9244e014 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -17,7 +17,7 @@ def _append_steps(dc: "DataChain", other: "DataChain"): return dc -def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: +def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], bool]: """ Creates new chain that consists of the last version of current delta dataset plus diff from the source with all needed modifications. @@ -38,7 +38,7 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: latest_version = catalog.get_dataset(name).latest_version except DatasetNotFoundError: # first creation of delta update dataset - return None + return None, True dependencies = catalog.get_dataset_dependencies( name, latest_version, indirect=False @@ -52,7 +52,7 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: if not dep: # starting dataset (e.g listing) was removed so we are backing off to normal # dataset creation, as it was created first time - return None + return None, True source_ds_name = dep.name source_ds_version = int(dep.version) @@ -68,6 +68,9 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: # We append all the steps from the original chain to diff, e.g filters, mappers. 
diff = _append_steps(diff, dc) + if diff.is_empty(): + return None, False + # merging diff and the latest version of dataset return ( datachain.read_dataset(name, latest_version) @@ -79,4 +82,4 @@ def delta_update(dc: "DataChain", name: str) -> Optional["DataChain"]: modified=False, ) .union(diff) - ) + ), True diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index d0490dd33..003830773 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -291,7 +291,7 @@ def _evolve( _sys = self._sys return type(self)( query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys - ) + ).as_delta(self.delta) def settings( self, @@ -497,7 +497,7 @@ def save( # type: ignore[override] description: Optional[str] = None, attrs: Optional[list[str]] = None, **kwargs, - ) -> "Self": + ) -> "DataChain": """Save to a Dataset. It returns the chain itself. Parameters: @@ -523,13 +523,24 @@ def save( # type: ignore[override] """ schema = self.signals_schema.clone_without_sys_signals().serialize() if self.delta and name: - delta_ds = delta_update(self, name) + delta_ds, has_changes = delta_update(self, name) + if delta_ds: return self._evolve( query=delta_ds._query.save( name=name, version=version, feature_schema=schema, **kwargs ) ) + + if not has_changes: + # sources have not been changed so new version of resulting dataset + # would be the same as previous one. To avoid duplicating exact + # datasets, we won't create new version of it and we will return + # current latest version instead. + from .datasets import read_dataset + + return read_dataset(name, **kwargs) + return self._evolve( query=self._query.save( name=name, @@ -2208,6 +2219,10 @@ def count(self) -> int: """Return the number of rows in the chain.""" return self._query.count() + def is_empty(self) -> bool: + """Returns True if chain has zero number of rows""" + return not bool(self.count()) + def exec(self) -> "Self": """Execute the chain.""" return self._evolve(query=self._query.exec()) diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 9814d1f89..20599e450 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -138,7 +138,7 @@ def read_storage( ) continue - dc = read_dataset(list_ds_name, session=session, settings=settings, delta=delta) + dc = read_dataset(list_ds_name, session=session, settings=settings) dc._query.update = update dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type}) @@ -184,4 +184,4 @@ def lst_fn(ds_name, lst_uri): assert storage_chain is not None - return storage_chain + return storage_chain.as_delta(delta) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index e14944694..28da6e3a4 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -6,6 +6,7 @@ import datachain as dc from datachain import func +from datachain.error import DatasetVersionNotFoundError from datachain.lib.dc import C from datachain.lib.file import File, ImageFile @@ -182,24 +183,19 @@ def get_index(file: File) -> int: create_delta_dataset() create_delta_dataset() - assert ( - list( - dc.read_dataset(ds_name, version=1) - .order_by("file.path") - .collect("file.path") - ) - == list( - dc.read_dataset(ds_name, version=2) - .order_by("file.path") - .collect("file.path") - ) - == [ - "images/img6.jpg", - "images/img7.jpg", - "images/img8.jpg", - "images/img9.jpg", - ] - ) + assert list( + dc.read_dataset(ds_name, version=1).order_by("file.path").collect("file.path") + ) == [ + 
"images/img6.jpg", + "images/img7.jpg", + "images/img8.jpg", + "images/img9.jpg", + ] + + with pytest.raises(DatasetVersionNotFoundError) as exc_info: + dc.read_dataset(ds_name, version=2) + + assert str(exc_info.value) == f"Dataset {ds_name} does not have version 2" def test_delta_update_no_file_signals(test_session): diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index ddcb8d72c..2e1568aef 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -274,6 +274,10 @@ def test_read_record_empty_chain_without_schema(test_session): ) +def test_is_empty(test_session): + assert dc.read_records([], schema=None, session=test_session).is_empty() is True + + def test_empty_chain_skip_udf_run(test_session): # Test that UDF is not called for empty chain with patch.object(UDFAdapter, "run") as mock_udf_run: From c670e33e48ad032dd7aaa0b1ce23aa76d0f61633 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 12:24:47 +0200 Subject: [PATCH 32/45] adding diff persist to avoid re-calculation of diff and removing obsolete delta comments --- src/datachain/delta.py | 2 ++ src/datachain/lib/dc/datachain.py | 14 -------------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index a9244e014..70647b260 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -71,6 +71,8 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo if diff.is_empty(): return None, False + diff = diff.persist() + # merging diff and the latest version of dataset return ( datachain.read_dataset(name, latest_version) diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 003830773..27a7b415f 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -506,20 +506,6 @@ def save( # type: ignore[override] description : description of a dataset. attrs : attributes of a dataset. They can be without value, e.g "NLP", or with a value, e.g "location=US". - delta : If True, we optimize on creation of the new dataset versions - by calculating diff between source and the last version of dataset - and applying all needed modifications (mappers, filters etc.) only - on that diff. - Then we merge modified diff with the last version of dataset to - create new version. This way we avoid applying modifications to all - records from source every time since that can be expensive operation. - Source can be cloud storage or other dataset which has File object - in schema. - Diff is calculated using `DataChain.diff()` method which looks into - File `source` and `path` for matching, and File `version` and `etag` - for checking if the record is changed. - Note that this takes in account only added and changed records in - source while deleted records are not removed in the new dataset version. 
""" schema = self.signals_schema.clone_without_sys_signals().serialize() if self.delta and name: From 773b22d0de63851448cf58e593d45d7fec089bb2 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 12:29:38 +0200 Subject: [PATCH 33/45] adding count after persist to avoid re-calculating diff twice --- src/datachain/delta.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 70647b260..f5124edc3 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -68,11 +68,12 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo # We append all the steps from the original chain to diff, e.g filters, mappers. diff = _append_steps(diff, dc) + # to avoid re-calculating diff multiple times + diff = diff.persist() + if diff.is_empty(): return None, False - diff = diff.persist() - # merging diff and the latest version of dataset return ( datachain.read_dataset(name, latest_version) From e8de5f29a767a1af0c4b354a8c674e3311682e77 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 13:19:44 +0200 Subject: [PATCH 34/45] moving ad_delta to private and fixing delta docs --- src/datachain/lib/dc/datachain.py | 4 ++-- src/datachain/lib/dc/datasets.py | 32 +++++++++++++++--------------- src/datachain/lib/dc/storage.py | 33 ++++++++++++++++--------------- 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 27a7b415f..6f8a3dc40 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -202,7 +202,7 @@ def __repr__(self) -> str: self.print_schema(file=file) return file.getvalue() - def as_delta(self, delta: bool = False) -> "Self": + def _as_delta(self, delta: bool = False) -> "Self": """Marks this chain as delta, which means special delta process will be called on saving dataset for optimization""" self._delta = delta @@ -291,7 +291,7 @@ def _evolve( _sys = self._sys return type(self)( query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys - ).as_delta(self.delta) + )._as_delta(self.delta) def settings( self, diff --git a/src/datachain/lib/dc/datasets.py b/src/datachain/lib/dc/datasets.py index 12b228b95..a82e92ab8 100644 --- a/src/datachain/lib/dc/datasets.py +++ b/src/datachain/lib/dc/datasets.py @@ -38,21 +38,21 @@ def read_dataset( settings : Settings to use for the chain. fallback_to_studio : Try to pull dataset from Studio if not found locally. Default is True. - delta : If True, we optimize on creation of the new dataset versions - by calculating diff between last version of this dataset and the version - with which last version of resulting chain dataset (the one specified in - `.save()`) was created. - We then run the "diff" chain with this diff data returned instead of - all dataset data, and we union that diff chain with last version of - resulting dataset creating new version of it. - This way we avoid applying modifications to all records from dataset - every time since that can be expensive operation. - Dataset needs to have File object in schema. - Diff is calculated using `DataChain.diff()` method which looks into - File `source` and `path` for matching, and File `version` and `etag` - for checking if the record is changed. - Note that this takes in account only added and changed records in - dataset while deleted records are not removed in the new dataset version. 
+ delta: If True, we optimize the creation of new dataset versions by calculating + the diff between the latest version of this dataset and the version used + to create the most recent version of the resulting chain dataset (the one + specified in .save()). + We then run the "diff" chain using only the diff data, instead of the + entire dataset, and merge that diff chain with the latest version of the + resulting dataset to create a new version. + This approach avoids modifying all records in the dataset every time, + which can be an expensive operation. + The dataset schema must include a File object. + The diff is calculated using the DataChain.diff() method, which compares + the source and path fields of File objects to find matches, and checks + the version and etag fields to determine if a record has changed. + Note that this process only accounts for added and modified records in + the dataset. Deleted records are not removed in the new dataset version. Example: ```py @@ -108,7 +108,7 @@ def read_dataset( signals_schema |= SignalSchema.deserialize(query.feature_schema) else: signals_schema |= SignalSchema.from_column_types(query.column_types or {}) - return DataChain(query, _settings, signals_schema).as_delta(delta) + return DataChain(query, _settings, signals_schema)._as_delta(delta) def datasets( diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 20599e450..743d1d012 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -48,21 +48,22 @@ def read_storage( update : force storage reindexing. Default is False. anon : If True, we will treat cloud bucket as public one client_config : Optional client configuration for the storage client. - delta : If True, we optimize on creation of the new dataset versions - by calculating diff between last version of this storage and the version - with which last version of resulting chain dataset (the one specified in - `.save()`) was created. - We then run the "diff" chain with this diff data returned instead of - all storage data, and we union that diff chain with last version of - resulting dataset creating new version of it. - This way we avoid applying modifications to all records from storage - every time since that can be expensive operation. - Dataset needs to have File object in schema. - Diff is calculated using `DataChain.diff()` method which looks into - File `source` and `path` for matching, and File `version` and `etag` - for checking if the record is changed. - Note that this takes in account only added and changed records in - storage while deleted records are not removed in the new dataset version. + delta: If True, we optimize the creation of new dataset versions by calculating + the diff between the latest version of this storage and the version used to + create the most recent version of the resulting chain dataset (the one + specified in .save()). + We then run the "diff" chain using only the diff data, rather than the + entire storage data, and merge that diff chain with the latest version + of the resulting dataset to create a new version. + This approach avoids applying modifications to all records from storage + every time, which can be an expensive operation. + The dataset schema must include a File object. + The diff is calculated using the DataChain.diff() method, which compares + the source and path fields of File objects to find matches, and checks the + version and etag fields to determine if a record has changed. 
+ Note that this process only considers added and modified records in + storage. + Deleted records are not removed from the new dataset version. Returns: DataChain: A DataChain object containing the file information. @@ -184,4 +185,4 @@ def lst_fn(ds_name, lst_uri): assert storage_chain is not None - return storage_chain.as_delta(delta) + return storage_chain._as_delta(delta) From e8d6f2dfd147c259c0ba54ac7d88ca0aec542c60 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 13:47:17 +0200 Subject: [PATCH 35/45] removing not reachable codebase --- src/datachain/delta.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index f5124edc3..9b2b61802 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -43,10 +43,6 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo dependencies = catalog.get_dataset_dependencies( name, latest_version, indirect=False ) - if len(dependencies) > 1: - raise Exception( - "Cannot do delta with dataset that has multiple direct dependencies" - ) dep = dependencies[0] if not dep: From 803345ae88355295d371e162059cfe4029ca8e61 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 15:19:18 +0200 Subject: [PATCH 36/45] fixing lint issue --- src/datachain/lib/dc/storage.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 743d1d012..0722a0486 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -1,4 +1,5 @@ import os.path +from functools import reduce from typing import ( TYPE_CHECKING, Optional, @@ -122,7 +123,7 @@ def read_storage( if not uris: raise ValueError("No URIs provided") - storage_chain = None + chains = [] listed_ds_name = set() file_values = [] @@ -166,11 +167,11 @@ def lst_fn(ds_name, lst_uri): lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri) ) - chain = ls(dc, list_path, recursive=recursive, column=column) - - storage_chain = storage_chain.union(chain) if storage_chain else chain # type: ignore[attr-defined] + chains.append(ls(dc, list_path, recursive=recursive, column=column)) listed_ds_name.add(list_ds_name) + storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains) + if file_values: file_chain = read_values( session=session, From 08a4c1bea6fdcc5a383aaee51ad4e23bf17b78a5 Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 15:46:14 +0200 Subject: [PATCH 37/45] added test to check num of processing calls --- tests/func/test_delta.py | 44 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 28da6e3a4..b6cbca081 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -153,6 +153,50 @@ def get_index(file: File) -> int: ) +def test_delta_update_check_num_calls(test_session, tmp_dir, tmp_path, capsys): + ds_name = "delta_ds" + path = tmp_dir.as_uri() + tmp_dir = tmp_dir / "images" + os.mkdir(tmp_dir) + map_print = "In map" + + images = [ + { + "name": f"img{i}.jpg", + "data": Image.new(mode="RGB", size=((i + 1) * 10, (i + 1) * 10)), + } + for i in range(20) + ] + + # save only half of the images for now + for img in images[:10]: + img["data"].save(tmp_dir / img["name"]) + + def create_delta_dataset(): + def get_index(file: File) -> int: + print(map_print) # needed to count number of map calls + r = r".+\/img(\d+)\.jpg" + return int(re.search(r, 
file.path).group(1)) # type: ignore[union-attr] + + ( + dc.read_storage(path, update=True, session=test_session, delta=True) + .map(index=get_index) + .save(ds_name) + ) + + # first version of delta dataset + create_delta_dataset() + # save other half of images + for img in images[10:]: + img["data"].save(tmp_dir / img["name"]) + # second version of delta dataset + create_delta_dataset() + + captured = capsys.readouterr() + # assert captured.out == "Garbage collecting 2 tables.\n" + assert captured.out == "\n".join([map_print] * 20) + "\n" + + def test_delta_update_no_diff(test_session, tmp_dir, tmp_path): ds_name = "delta_ds" path = tmp_dir.as_uri() From b0470ce3206782015de3528edd187c16a8e30c6f Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 28 Apr 2025 16:08:55 +0200 Subject: [PATCH 38/45] adding schema to diff instead of appending --- src/datachain/delta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 9b2b61802..8a3855b29 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -13,7 +13,7 @@ def _append_steps(dc: "DataChain", other: "DataChain"): """ dc = dc.clone() dc._query.steps += other._query.steps.copy() - dc.signals_schema = dc.signals_schema.append(other.signals_schema) + dc.signals_schema = other.signals_schema return dc From 2ab17590385c1aad14249e991ae70e3a2fb2b43c Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 6 May 2025 11:39:03 +0200 Subject: [PATCH 39/45] moving delta_disabled to delta.py --- src/datachain/delta.py | 30 +++++++++++++++++++++++++++++- src/datachain/lib/dc/datachain.py | 25 ++----------------------- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 8a3855b29..fc0a7be90 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -1,11 +1,39 @@ -from typing import TYPE_CHECKING, Optional +from functools import wraps +from typing import TYPE_CHECKING, Callable, Optional, TypeVar import datachain from datachain.error import DatasetNotFoundError if TYPE_CHECKING: + from typing_extensions import Concatenate, ParamSpec + from datachain.lib.dc import DataChain + P = ParamSpec("P") + + +T = TypeVar("T", bound="DataChain") + + +def delta_disabled( + method: "Callable[Concatenate[T, P], T]", +) -> "Callable[Concatenate[T, P], T]": + """ + Decorator for disabling DataChain methods (e.g `.agg()` or `.union()`) to + work with delta updates. It throws `NotImplementedError` if chain on which + method is called is marked as delta. + """ + + @wraps(method) + def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T: + if self.delta: + raise NotImplementedError( + f"Delta update cannot be used with {method.__name__}" + ) + return method(self, *args, **kwargs) + + return _inner + def _append_steps(dc: "DataChain", other: "DataChain"): """Returns cloned chain with appended steps from other chain. 
diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 6f8a3dc40..d4b3440a2 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -4,7 +4,6 @@ import sys import warnings from collections.abc import Iterator, Sequence -from functools import wraps from typing import ( IO, TYPE_CHECKING, @@ -25,7 +24,7 @@ from tqdm import tqdm from datachain.dataset import DatasetRecord -from datachain.delta import delta_update +from datachain.delta import delta_disabled, delta_update from datachain.func import literal from datachain.func.base import Function from datachain.func.func import Func @@ -68,7 +67,7 @@ if TYPE_CHECKING: import pandas as pd - from typing_extensions import Concatenate, ParamSpec, Self + from typing_extensions import ParamSpec, Self P = ParamSpec("P") @@ -76,26 +75,6 @@ T = TypeVar("T", bound="DataChain") -def delta_disabled( - method: "Callable[Concatenate[T, P], T]", -) -> "Callable[Concatenate[T, P], T]": - """ - Decorator for disabling DataChain methods (e.g `.agg()` or `.union()`) to - work with delta updates. It throws `NotImplementedError` if chain on which - method is called is marked as delta. - """ - - @wraps(method) - def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T: - if self.delta: - raise NotImplementedError( - f"Delta update cannot be used with {method.__name__}" - ) - return method(self, *args, **kwargs) - - return _inner - - class DataChain: """DataChain - a data structure for batch data processing and evaluation. From 594ef7da2e245674e27a756bce81164fd13e2e4d Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 6 May 2025 12:26:08 +0200 Subject: [PATCH 40/45] moved is_empty to property empty --- src/datachain/delta.py | 2 +- src/datachain/lib/dc/datachain.py | 10 ++++++---- tests/unit/lib/test_datachain.py | 4 ++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index fc0a7be90..efead5740 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -95,7 +95,7 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo # to avoid re-calculating diff multiple times diff = diff.persist() - if diff.is_empty(): + if diff.empty: return None, False # merging diff and the latest version of dataset diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index d4b3440a2..c8ddf1a37 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -187,8 +187,14 @@ def _as_delta(self, delta: bool = False) -> "Self": self._delta = delta return self + @property + def empty(self) -> bool: + """Returns True if chain has zero number of rows""" + return not bool(self.count()) + @property def delta(self) -> bool: + """Returns True if this chain is ran in "delta" update mode""" return self._delta @property @@ -2184,10 +2190,6 @@ def count(self) -> int: """Return the number of rows in the chain.""" return self._query.count() - def is_empty(self) -> bool: - """Returns True if chain has zero number of rows""" - return not bool(self.count()) - def exec(self) -> "Self": """Execute the chain.""" return self._evolve(query=self._query.exec()) diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py index b511e8533..4af272925 100644 --- a/tests/unit/lib/test_datachain.py +++ b/tests/unit/lib/test_datachain.py @@ -274,8 +274,8 @@ def test_read_record_empty_chain_without_schema(test_session): ) -def test_is_empty(test_session): - assert 
dc.read_records([], schema=None, session=test_session).is_empty() is True +def test_empty(test_session): + assert dc.read_records([], schema=None, session=test_session).empty is True def test_empty_chain_skip_udf_run(test_session): From 567d63f347e40610fd8abe4645013c39dfcdf408 Mon Sep 17 00:00:00 2001 From: ilongin Date: Thu, 8 May 2025 02:16:54 +0200 Subject: [PATCH 41/45] adding custom fields to calculate diff in delta update --- src/datachain/delta.py | 27 ++++----- src/datachain/diff/__init__.py | 6 +- src/datachain/lib/dc/datachain.py | 34 ++++++++++-- src/datachain/lib/dc/datasets.py | 55 ++++++++++++------ src/datachain/lib/dc/storage.py | 52 +++++++++++------ tests/func/test_delta.py | 92 +++++++++++++++++++------------ 6 files changed, 177 insertions(+), 89 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index efead5740..05c03505d 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -1,5 +1,6 @@ +from collections.abc import Sequence from functools import wraps -from typing import TYPE_CHECKING, Callable, Optional, TypeVar +from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union import datachain from datachain.error import DatasetNotFoundError @@ -45,7 +46,13 @@ def _append_steps(dc: "DataChain", other: "DataChain"): return dc -def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], bool]: +def delta_update( + dc: "DataChain", + name: str, + on: Union[str, Sequence[str]], + right_on: Optional[Union[str, Sequence[str]]] = None, + compare: Optional[Union[str, Sequence[str]]] = None, +) -> tuple[Optional["DataChain"], bool]: """ Creates new chain that consists of the last version of current delta dataset plus diff from the source with all needed modifications. @@ -58,10 +65,6 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo catalog = dc.session.catalog dc._query.apply_listing_pre_step() - chain_file_signal = dc.signals_schema.get_file_signal() - if not chain_file_signal: - raise ValueError("Chain doesn't produce file signal, cannot do delta update") - try: latest_version = catalog.get_dataset(name).latest_version except DatasetNotFoundError: @@ -84,11 +87,8 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo source_dc = datachain.read_dataset(source_ds_name, source_ds_version) source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version) - source_file_signal = source_dc.signals_schema.get_file_signal() - if not source_file_signal: - raise ValueError("Source dataset doesn't have file signals") - diff = source_dc_latest.diff(source_dc, on=source_file_signal) + diff = source_dc_latest.compare(source_dc, on=on, compare=compare) # We append all the steps from the original chain to diff, e.g filters, mappers. 
diff = _append_steps(diff, dc) @@ -101,10 +101,11 @@ def delta_update(dc: "DataChain", name: str) -> tuple[Optional["DataChain"], boo # merging diff and the latest version of dataset return ( datachain.read_dataset(name, latest_version) - .diff( + .compare( diff, - on=chain_file_signal, - right_on=source_file_signal, + on=on, + compare=compare, + right_on=right_on, added=True, modified=False, ) diff --git a/src/datachain/diff/__init__.py b/src/datachain/diff/__init__.py index 161c72f34..93451a66d 100644 --- a/src/datachain/diff/__init__.py +++ b/src/datachain/diff/__init__.py @@ -77,14 +77,16 @@ def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]: cols_select = list(left.signals_schema.clone_without_sys_signals().values.keys()) # getting correct on and right_on column names + on_ = on on = left.signals_schema.resolve(*on).db_signals() # type: ignore[assignment] - right_on = right.signals_schema.resolve(*(right_on or on)).db_signals() # type: ignore[assignment] + right_on = right.signals_schema.resolve(*(right_on or on_)).db_signals() # type: ignore[assignment] # getting correct compare and right_compare column names if they are defined if compare: + compare_ = compare compare = left.signals_schema.resolve(*compare).db_signals() # type: ignore[assignment] right_compare = right.signals_schema.resolve( - *(right_compare or compare) + *(right_compare or compare_) ).db_signals() # type: ignore[assignment] elif not compare and len(cols) != len(right_cols): # here we will mark all rows that are not added or deleted as modified since diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index c8ddf1a37..6f2a4c05a 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -181,10 +181,20 @@ def __repr__(self) -> str: self.print_schema(file=file) return file.getvalue() - def _as_delta(self, delta: bool = False) -> "Self": + def _as_delta( + self, + on: Optional[Union[str, Sequence[str]]] = None, + right_on: Optional[Union[str, Sequence[str]]] = None, + compare: Optional[Union[str, Sequence[str]]] = None, + ) -> "Self": """Marks this chain as delta, which means special delta process will be called on saving dataset for optimization""" - self._delta = delta + if on is None: + raise ValueError("'delta on' fields must be defined") + self._delta = True + self._delta_on = on + self._delta_right_on = right_on + self._delta_compare = compare return self @property @@ -274,9 +284,17 @@ def _evolve( signal_schema = copy.deepcopy(self.signals_schema) if _sys is None: _sys = self._sys - return type(self)( + chain = type(self)( query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys - )._as_delta(self.delta) + ) + if self.delta: + chain = chain._as_delta( + on=self._delta_on, + right_on=self._delta_right_on, + compare=self._delta_compare, + ) + + return chain def settings( self, @@ -494,7 +512,13 @@ def save( # type: ignore[override] """ schema = self.signals_schema.clone_without_sys_signals().serialize() if self.delta and name: - delta_ds, has_changes = delta_update(self, name) + delta_ds, has_changes = delta_update( + self, + name, + on=self._delta_on, + right_on=self._delta_right_on, + compare=self._delta_compare, + ) if delta_ds: return self._evolve( diff --git a/src/datachain/lib/dc/datasets.py b/src/datachain/lib/dc/datasets.py index a82e92ab8..073a0eacd 100644 --- a/src/datachain/lib/dc/datasets.py +++ b/src/datachain/lib/dc/datasets.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING, Optional, 
get_origin, get_type_hints
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
 
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
@@ -27,6 +28,10 @@ def read_dataset(
     settings: Optional[dict] = None,
     fallback_to_studio: bool = True,
     delta: bool = False,
+    delta_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_right_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_compare: Optional[Union[str, Sequence[str]]] = None,
+    # delta_right_compare: Optional[Union[str, Sequence[str]]] = None,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
     If dataset or version is not found locally, it will try to pull it from Studio.
@@ -38,21 +43,32 @@ def read_dataset(
         settings : Settings to use for the chain.
         fallback_to_studio : Try to pull dataset from Studio if not found locally.
            Default is True.
-        delta: If True, we optimize the creation of new dataset versions by calculating
-            the diff between the latest version of this dataset and the version used
-            to create the most recent version of the resulting chain dataset (the one
-            specified in .save()).
-            We then run the "diff" chain using only the diff data, instead of the
-            entire dataset, and merge that diff chain with the latest version of the
-            resulting dataset to create a new version.
-            This approach avoids modifying all records in the dataset every time,
-            which can be an expensive operation.
-            The dataset schema must include a File object.
-            The diff is calculated using the DataChain.diff() method, which compares
-            the source and path fields of File objects to find matches, and checks
-            the version and etag fields to determine if a record has changed.
-            Note that this process only accounts for added and modified records in
-            the dataset. Deleted records are not removed in the new dataset version.
+        delta: If set to True, we optimize the creation of new dataset versions by
+            calculating the diff between the latest version of this dataset and the
+            version used to create the most recent version of the resulting chain
+            dataset (the one specified in `.save()`). We then run the "diff" chain
+            using only the diff data, rather than the entire dataset, and merge
+            that diff chain with the latest version of the resulting dataset to create
+            a new version. This approach avoids applying modifications to all records
+            from the dataset every time, which can be an expensive operation.
+            The diff is calculated using the `DataChain.compare()` method, which
+            compares the `delta_on` fields to find matches and checks the compare
+            fields to determine if a record has changed. Note that this process only
+            considers added and modified records in the dataset; deleted records are
+            not removed from the new dataset version.
+            This calculation is based on the difference between the current version
+            of the source and the version used to create the dataset.
+        delta_on: A list of fields that uniquely identify rows in the source.
+            If two rows have the same values, they are considered the same (e.g., they
+            could be different versions of the same row in a versioned source).
+            This is used in the delta update to calculate the diff.
+        delta_right_on: A list of fields in the final dataset that correspond to the
+            `delta_on` fields if they were renamed.
+            There is no need to define this if the fields from `delta_on` are present
+            in the final dataset.
+ delta_compare: A list of fields used to check if the same row has been modified + in the new version of the source. + If not defined, all fields except those defined in delta_on will be used. Example: ```py @@ -108,7 +124,12 @@ def read_dataset( signals_schema |= SignalSchema.deserialize(query.feature_schema) else: signals_schema |= SignalSchema.from_column_types(query.column_types or {}) - return DataChain(query, _settings, signals_schema)._as_delta(delta) + chain = DataChain(query, _settings, signals_schema) + if delta: + chain = chain._as_delta( + on=delta_on, right_on=delta_right_on, compare=delta_compare + ) + return chain def datasets( diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 0722a0486..196de3140 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -1,4 +1,5 @@ import os.path +from collections.abc import Sequence from functools import reduce from typing import ( TYPE_CHECKING, @@ -34,6 +35,9 @@ def read_storage( update: bool = False, anon: bool = False, delta: bool = False, + delta_on: Optional[Union[str, Sequence[str]]] = None, + delta_right_on: Optional[Union[str, Sequence[str]]] = None, + delta_compare: Optional[Union[str, Sequence[str]]] = None, client_config: Optional[dict] = None, ) -> "DataChain": """Get data from storage(s) as a list of file with all file attributes. @@ -49,22 +53,32 @@ def read_storage( update : force storage reindexing. Default is False. anon : If True, we will treat cloud bucket as public one client_config : Optional client configuration for the storage client. - delta: If True, we optimize the creation of new dataset versions by calculating - the diff between the latest version of this storage and the version used to - create the most recent version of the resulting chain dataset (the one - specified in .save()). - We then run the "diff" chain using only the diff data, rather than the - entire storage data, and merge that diff chain with the latest version - of the resulting dataset to create a new version. - This approach avoids applying modifications to all records from storage - every time, which can be an expensive operation. - The dataset schema must include a File object. - The diff is calculated using the DataChain.diff() method, which compares - the source and path fields of File objects to find matches, and checks the - version and etag fields to determine if a record has changed. - Note that this process only considers added and modified records in - storage. - Deleted records are not removed from the new dataset version. + delta: If set to True, we optimize the creation of new dataset versions by + calculating the diff between the latest version of this storage and the + version used to create the most recent version of the resulting chain + dataset (the one specified in `.save()`). We then run the "diff" chain + using only the diff data, rather than the entire storage data, and merge + that diff chain with the latest version of the resulting dataset to create + a new version. This approach avoids applying modifications to all records + from storage every time, which can be an expensive operation. + The diff is calculated using the `DataChain.compare()` method, which + compares the `delta_on` fields to find matches and checks the compare + fields to determine if a record has changed. Note that this process only + considers added and modified records in storage; deleted records are not + removed from the new dataset version. 
+ This calculation is based on the difference between the current version + of the source and the version used to create the dataset. + delta_on: A list of fields that uniquely identify rows in the source. + If two rows have the same values, they are considered the same (e.g., they + could be different versions of the same row in a versioned source). + This is used in the delta update to calculate the diff. + delta_right_on: A list of fields in the final dataset that correspond to the + `delta_on` fields if they were renamed. + There is no need to define this if the fields from `delta_on` are present + in the final dataset. + delta_compare: A list of fields used to check if the same row has been modified + in the new version of the source. + If not defined, all fields except those defined in delta_on will be used. Returns: DataChain: A DataChain object containing the file information. @@ -186,4 +200,8 @@ def lst_fn(ds_name, lst_uri): assert storage_chain is not None - return storage_chain._as_delta(delta) + if delta: + storage_chain = storage_chain._as_delta( + on=delta_on, right_on=delta_right_on, compare=delta_compare + ) + return storage_chain diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index b6cbca081..9af749c10 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -32,9 +32,13 @@ def create_image_dataset(ds_name, images): ).save(ds_name) def create_delta_dataset(ds_name): - dc.read_dataset(starting_ds_name, session=test_session, delta=True).save( - ds_name - ) + dc.read_dataset( + starting_ds_name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + delta_compare=["file.version", "file.etag"], + ).save(ds_name) # first version of starting dataset create_image_dataset(starting_ds_name, images[:2]) @@ -89,7 +93,14 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - dc.read_storage(path, update=True, session=test_session, delta=True) + dc.read_storage( + path, + update=True, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + delta_compare=["file.version", "file.etag"], + ) .filter(C("file.path").glob("*.jpg")) .map(emb=my_embedding) .mutate(dist=func.cosine_distance("emb", (0.1, 0.2))) @@ -179,7 +190,14 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - dc.read_storage(path, update=True, session=test_session, delta=True) + dc.read_storage( + path, + update=True, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + delta_compare=["file.version", "file.etag"], + ) .map(index=get_index) .save(ds_name) ) @@ -217,7 +235,14 @@ def get_index(file: File) -> int: return int(re.search(r, file.path).group(1)) # type: ignore[union-attr] ( - dc.read_storage(path, update=True, session=test_session, delta=True) + dc.read_storage( + path, + update=True, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + delta_compare=["file.version", "file.etag"], + ) .filter(C("file.path").glob("*.jpg")) .map(index=get_index) .filter(C("index") > 5) @@ -242,24 +267,6 @@ def get_index(file: File) -> int: assert str(exc_info.value) == f"Dataset {ds_name} does not have version 2" -def test_delta_update_no_file_signals(test_session): - starting_ds_name = "starting_ds" - - dc.read_values(num=[10, 20], session=test_session).save(starting_ds_name) - - with pytest.raises(ValueError) as excinfo: - dc.read_dataset( - starting_ds_name, - 
session=test_session, - delta=True, - ).save("delta_ds") - - assert ( - str(excinfo.value) - == "Chain doesn't produce file signal, cannot do delta update" - ) - - @pytest.fixture def file_dataset(test_session): return dc.read_values( @@ -276,9 +283,12 @@ def test_delta_update_union(test_session, file_dataset): with pytest.raises(NotImplementedError) as excinfo: ( - dc.read_dataset(file_dataset.name, session=test_session, delta=True).union( - dc.read_dataset("numbers"), session=test_session - ) + dc.read_dataset( + file_dataset.name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + ).union(dc.read_dataset("numbers"), session=test_session) ) assert str(excinfo.value) == "Delta update cannot be used with union" @@ -289,9 +299,12 @@ def test_delta_update_merge(test_session, file_dataset): with pytest.raises(NotImplementedError) as excinfo: ( - dc.read_dataset(file_dataset.name, session=test_session, delta=True).merge( - dc.read_dataset("numbers"), on="id", session=test_session - ) + dc.read_dataset( + file_dataset.name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + ).merge(dc.read_dataset("numbers"), on="id", session=test_session) ) assert str(excinfo.value) == "Delta update cannot be used with merge" @@ -301,7 +314,10 @@ def test_delta_update_distinct(test_session, file_dataset): with pytest.raises(NotImplementedError) as excinfo: ( dc.read_dataset( - file_dataset.name, session=test_session, delta=True + file_dataset.name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], ).distinct("file.path") ) @@ -312,7 +328,10 @@ def test_delta_update_group_by(test_session, file_dataset): with pytest.raises(NotImplementedError) as excinfo: ( dc.read_dataset( - file_dataset.name, session=test_session, delta=True + file_dataset.name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], ).group_by(cnt=func.count(), partition_by="file.path") ) @@ -322,9 +341,12 @@ def test_delta_update_group_by(test_session, file_dataset): def test_delta_update_agg(test_session, file_dataset): with pytest.raises(NotImplementedError) as excinfo: ( - dc.read_dataset(file_dataset.name, session=test_session, delta=True).agg( - cnt=func.count(), partition_by="file.path" - ) + dc.read_dataset( + file_dataset.name, + session=test_session, + delta=True, + delta_on=["file.source", "file.path"], + ).agg(cnt=func.count(), partition_by="file.path") ) assert str(excinfo.value) == "Delta update cannot be used with agg" From e1f60c79c41894774410a72af47e4caca02db76c Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 12 May 2025 10:36:42 +0200 Subject: [PATCH 42/45] fixing semver --- src/datachain/delta.py | 2 +- tests/func/test_delta.py | 28 +++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 05c03505d..3772af783 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -82,7 +82,7 @@ def delta_update( return None, True source_ds_name = dep.name - source_ds_version = int(dep.version) + source_ds_version = dep.version source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version source_dc = datachain.read_dataset(source_ds_name, source_ds_version) diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 9af749c10..81199da7c 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -50,14 +50,18 @@ def create_delta_dataset(ds_name): create_delta_dataset(ds_name) assert list( - 
dc.read_dataset(ds_name, version=1).order_by("file.path").collect("file.path") + dc.read_dataset(ds_name, version="1.0.0") + .order_by("file.path") + .collect("file.path") ) == [ "img1.jpg", "img2.jpg", ] assert list( - dc.read_dataset(ds_name, version=2).order_by("file.path").collect("file.path") + dc.read_dataset(ds_name, version="1.0.1") + .order_by("file.path") + .collect("file.path") ) == [ "img1.jpg", "img2.jpg", @@ -116,7 +120,7 @@ def get_index(file: File) -> int: # into consideration on delta update etags = { r[0]: r[1].etag - for r in dc.read_dataset(ds_name, version=1).collect("index", "file") + for r in dc.read_dataset(ds_name, version="1.0.0").collect("index", "file") } # remove last couple of images to simulate modification since we will re-create it @@ -131,7 +135,9 @@ def get_index(file: File) -> int: create_delta_dataset() assert list( - dc.read_dataset(ds_name, version=1).order_by("file.path").collect("file.path") + dc.read_dataset(ds_name, version="1.0.0") + .order_by("file.path") + .collect("file.path") ) == [ "images/img4.jpg", "images/img6.jpg", @@ -139,7 +145,9 @@ def get_index(file: File) -> int: ] assert list( - dc.read_dataset(ds_name, version=2).order_by("file.path").collect("file.path") + dc.read_dataset(ds_name, version="1.0.1") + .order_by("file.path") + .collect("file.path") ) == [ "images/img10.jpg", "images/img12.jpg", @@ -155,7 +163,7 @@ def get_index(file: File) -> int: # and modified rows etags should be bigger than the old ones assert ( next( - dc.read_dataset(ds_name, version=2) + dc.read_dataset(ds_name, version="1.0.1") .filter(C("index") == 6) .order_by("file.path", "file.etag") .collect("file.etag") @@ -253,7 +261,9 @@ def get_index(file: File) -> int: create_delta_dataset() assert list( - dc.read_dataset(ds_name, version=1).order_by("file.path").collect("file.path") + dc.read_dataset(ds_name, version="1.0.0") + .order_by("file.path") + .collect("file.path") ) == [ "images/img6.jpg", "images/img7.jpg", @@ -262,9 +272,9 @@ def get_index(file: File) -> int: ] with pytest.raises(DatasetVersionNotFoundError) as exc_info: - dc.read_dataset(ds_name, version=2) + dc.read_dataset(ds_name, version="1.0.1") - assert str(exc_info.value) == f"Dataset {ds_name} does not have version 2" + assert str(exc_info.value) == f"Dataset {ds_name} does not have version 1.0.1" @pytest.fixture From be704a202214900d1f9a55c91777850c15e5344c Mon Sep 17 00:00:00 2001 From: ilongin Date: Mon, 12 May 2025 11:09:24 +0200 Subject: [PATCH 43/45] renamed field --- src/datachain/lib/dc/datachain.py | 6 +++--- src/datachain/lib/dc/datasets.py | 19 +++++++++++-------- src/datachain/lib/dc/storage.py | 18 +++++++++++------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index 5725f3daf..ac5b25149 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -194,7 +194,7 @@ def _as_delta( raise ValueError("'delta on' fields must be defined") self._delta = True self._delta_on = on - self._delta_right_on = right_on + self._delta_result_on = right_on self._delta_compare = compare return self @@ -291,7 +291,7 @@ def _evolve( if self.delta: chain = chain._as_delta( on=self._delta_on, - right_on=self._delta_right_on, + right_on=self._delta_result_on, compare=self._delta_compare, ) @@ -521,7 +521,7 @@ def save( # type: ignore[override] self, name, on=self._delta_on, - right_on=self._delta_right_on, + right_on=self._delta_result_on, compare=self._delta_compare, ) diff --git 
a/src/datachain/lib/dc/datasets.py b/src/datachain/lib/dc/datasets.py index 05d61ce22..d4a82e513 100644 --- a/src/datachain/lib/dc/datasets.py +++ b/src/datachain/lib/dc/datasets.py @@ -28,11 +28,10 @@ def read_dataset( session: Optional[Session] = None, settings: Optional[dict] = None, fallback_to_studio: bool = True, - delta: bool = False, + delta: Optional[bool] = False, delta_on: Optional[Union[str, Sequence[str]]] = None, - delta_right_on: Optional[Union[str, Sequence[str]]] = None, + delta_result_on: Optional[Union[str, Sequence[str]]] = None, delta_compare: Optional[Union[str, Sequence[str]]] = None, - # delta_right_compare: Optional[Union[str, Sequence[str]]] = None, ) -> "DataChain": """Get data from a saved Dataset. It returns the chain itself. If dataset or version is not found locally, it will try to pull it from Studio. @@ -63,10 +62,14 @@ def read_dataset( If two rows have the same values, they are considered the same (e.g., they could be different versions of the same row in a versioned source). This is used in the delta update to calculate the diff. - delta_right_on: A list of fields in the final dataset that correspond to the - `delta_on` fields if they were renamed. - There is no need to define this if the fields from `delta_on` are present - in the final dataset. + delta_result_on: A list of fields in the resulting dataset that correspond + to the `delta_on` fields from the source. + This is needed to identify rows that have changed in the source but are + already present in the current version of the resulting dataset, in order + to avoid including outdated versions of those rows in the new dataset. + We retain only the latest versions of rows to prevent duplication. + There is no need to define this if the `delta_on` fields are present in + the final dataset and have not been renamed. delta_compare: A list of fields used to check if the same row has been modified in the new version of the source. If not defined, all fields except those defined in delta_on will be used. @@ -148,7 +151,7 @@ def read_dataset( chain = DataChain(query, _settings, signals_schema) if delta: chain = chain._as_delta( - on=delta_on, right_on=delta_right_on, compare=delta_compare + on=delta_on, right_on=delta_result_on, compare=delta_compare ) return chain diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index edf8c0427..91d37ce66 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -35,9 +35,9 @@ def read_storage( column: str = "file", update: bool = False, anon: bool = False, - delta: bool = False, + delta: Optional[bool] = False, delta_on: Optional[Union[str, Sequence[str]]] = None, - delta_right_on: Optional[Union[str, Sequence[str]]] = None, + delta_result_on: Optional[Union[str, Sequence[str]]] = None, delta_compare: Optional[Union[str, Sequence[str]]] = None, client_config: Optional[dict] = None, ) -> "DataChain": @@ -73,10 +73,14 @@ def read_storage( If two rows have the same values, they are considered the same (e.g., they could be different versions of the same row in a versioned source). This is used in the delta update to calculate the diff. - delta_right_on: A list of fields in the final dataset that correspond to the - `delta_on` fields if they were renamed. - There is no need to define this if the fields from `delta_on` are present - in the final dataset. + delta_result_on: A list of fields in the resulting dataset that correspond + to the `delta_on` fields from the source. 
+ This is needed to identify rows that have changed in the source but are + already present in the current version of the resulting dataset, in order + to avoid including outdated versions of those rows in the new dataset. + We retain only the latest versions of rows to prevent duplication. + There is no need to define this if the `delta_on` fields are present in + the final dataset and have not been renamed. delta_compare: A list of fields used to check if the same row has been modified in the new version of the source. If not defined, all fields except those defined in delta_on will be used. @@ -210,6 +214,6 @@ def lst_fn(ds_name, lst_uri): if delta: storage_chain = storage_chain._as_delta( - on=delta_on, right_on=delta_right_on, compare=delta_compare + on=delta_on, right_on=delta_result_on, compare=delta_compare ) return storage_chain From 5decfeb576b142e31c24008542e6a44e9f27aed0 Mon Sep 17 00:00:00 2001 From: ilongin Date: Tue, 13 May 2025 17:29:41 +0200 Subject: [PATCH 44/45] fixing dataset dependencies in delta update --- src/datachain/delta.py | 23 ++++++++++++++--------- src/datachain/lib/dc/datachain.py | 8 ++++++-- src/datachain/query/dataset.py | 6 +++++- tests/func/test_delta.py | 17 +++++++++++++++++ 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 3772af783..4293e03c4 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -1,8 +1,10 @@ from collections.abc import Sequence +from copy import copy from functools import wraps from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union import datachain +from datachain.dataset import DatasetDependency from datachain.error import DatasetNotFoundError if TYPE_CHECKING: @@ -52,7 +54,7 @@ def delta_update( on: Union[str, Sequence[str]], right_on: Optional[Union[str, Sequence[str]]] = None, compare: Optional[Union[str, Sequence[str]]] = None, -) -> tuple[Optional["DataChain"], bool]: +) -> tuple[Optional["DataChain"], Optional[list[DatasetDependency]], bool]: """ Creates new chain that consists of the last version of current delta dataset plus diff from the source with all needed modifications. 
@@ -69,7 +71,7 @@ def delta_update( latest_version = catalog.get_dataset(name).latest_version except DatasetNotFoundError: # first creation of delta update dataset - return None, True + return None, None, True dependencies = catalog.get_dataset_dependencies( name, latest_version, indirect=False @@ -79,11 +81,14 @@ def delta_update( if not dep: # starting dataset (e.g listing) was removed so we are backing off to normal # dataset creation, as it was created first time - return None, True + return None, None, True source_ds_name = dep.name source_ds_version = dep.version source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version + dependencies = copy(dependencies) + dependencies = [d for d in dependencies if d is not None] # filter out removed dep + dependencies[0].version = source_ds_latest_version # type: ignore[union-attr] source_dc = datachain.read_dataset(source_ds_name, source_ds_version) source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version) @@ -96,18 +101,18 @@ def delta_update( diff = diff.persist() if diff.empty: - return None, False + return None, None, False # merging diff and the latest version of dataset - return ( + delta_chain = ( datachain.read_dataset(name, latest_version) .compare( diff, - on=on, - compare=compare, - right_on=right_on, + on=right_on or on, added=True, modified=False, ) .union(diff) - ), True + ) + + return delta_chain, dependencies, True # type: ignore[return-value] diff --git a/src/datachain/lib/dc/datachain.py b/src/datachain/lib/dc/datachain.py index ac5b25149..668f19155 100644 --- a/src/datachain/lib/dc/datachain.py +++ b/src/datachain/lib/dc/datachain.py @@ -517,7 +517,7 @@ def save( # type: ignore[override] schema = self.signals_schema.clone_without_sys_signals().serialize() if self.delta and name: - delta_ds, has_changes = delta_update( + delta_ds, dependencies, has_changes = delta_update( self, name, on=self._delta_on, @@ -528,7 +528,11 @@ def save( # type: ignore[override] if delta_ds: return self._evolve( query=delta_ds._query.save( - name=name, version=version, feature_schema=schema, **kwargs + name=name, + version=version, + feature_schema=schema, + dependencies=dependencies, + **kwargs, ) ) diff --git a/src/datachain/query/dataset.py b/src/datachain/query/dataset.py index 3bddee636..10194b3c4 100644 --- a/src/datachain/query/dataset.py +++ b/src/datachain/query/dataset.py @@ -41,7 +41,7 @@ partition_col_names, partition_columns, ) -from datachain.dataset import DATASET_PREFIX, DatasetStatus, RowDict +from datachain.dataset import DATASET_PREFIX, DatasetDependency, DatasetStatus, RowDict from datachain.error import DatasetNotFoundError, QueryScriptCancelError from datachain.func.base import Function from datachain.lib.listing import is_listing_dataset, listing_dataset_expired @@ -1698,6 +1698,7 @@ def save( name: Optional[str] = None, version: Optional[str] = None, feature_schema: Optional[dict] = None, + dependencies: Optional[list[DatasetDependency]] = None, description: Optional[str] = None, attrs: Optional[list[str]] = None, **kwargs, @@ -1751,6 +1752,9 @@ def save( ) self.catalog.update_dataset_version_with_warehouse_info(dataset, version) + if dependencies: + # overriding dependencies + self.dependencies = {(dep.name, dep.version) for dep in dependencies} self._add_dependencies(dataset, version) # type: ignore [arg-type] finally: self.cleanup() diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 81199da7c..80c013e63 100644 --- a/tests/func/test_delta.py +++ 
b/tests/func/test_delta.py @@ -11,7 +11,17 @@ from datachain.lib.file import File, ImageFile +def _get_dependencies(catalog, name, version) -> list[tuple[str, str]]: + return sorted( + [ + (d.name, d.version) + for d in catalog.get_dataset_dependencies(name, version, indirect=False) + ] + ) + + def test_delta_update_from_dataset(test_session, tmp_dir, tmp_path): + catalog = test_session.catalog starting_ds_name = "starting_ds" ds_name = "delta_ds" @@ -37,6 +47,7 @@ def create_delta_dataset(ds_name): session=test_session, delta=True, delta_on=["file.source", "file.path"], + delta_result_on=["file.source", "file.path"], delta_compare=["file.version", "file.etag"], ).save(ds_name) @@ -44,10 +55,12 @@ def create_delta_dataset(ds_name): create_image_dataset(starting_ds_name, images[:2]) # first version of delta dataset create_delta_dataset(ds_name) + assert _get_dependencies(catalog, ds_name, "1.0.0") == [(starting_ds_name, "1.0.0")] # second version of starting dataset create_image_dataset(starting_ds_name, images[2:]) # second version of delta dataset create_delta_dataset(ds_name) + assert _get_dependencies(catalog, ds_name, "1.0.1") == [(starting_ds_name, "1.0.1")] assert list( dc.read_dataset(ds_name, version="1.0.0") @@ -69,6 +82,8 @@ def create_delta_dataset(ds_name): "img4.jpg", ] + create_delta_dataset(ds_name) + def test_delta_update_from_storage(test_session, tmp_dir, tmp_path): ds_name = "delta_ds" @@ -103,6 +118,7 @@ def get_index(file: File) -> int: session=test_session, delta=True, delta_on=["file.source", "file.path"], + delta_result_on=["file.source", "file.path"], delta_compare=["file.version", "file.etag"], ) .filter(C("file.path").glob("*.jpg")) @@ -204,6 +220,7 @@ def get_index(file: File) -> int: session=test_session, delta=True, delta_on=["file.source", "file.path"], + delta_result_on=["file.source", "file.path"], delta_compare=["file.version", "file.etag"], ) .map(index=get_index) From ab9f9a30e1abd8a95edd42002f77e2940db102d3 Mon Sep 17 00:00:00 2001 From: ilongin Date: Wed, 14 May 2025 16:44:47 +0200 Subject: [PATCH 45/45] fixing small issues with deleted --- src/datachain/delta.py | 3 ++- src/datachain/lib/dc/storage.py | 2 +- tests/func/test_delta.py | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/datachain/delta.py b/src/datachain/delta.py index 4293e03c4..22465c25c 100644 --- a/src/datachain/delta.py +++ b/src/datachain/delta.py @@ -93,7 +93,7 @@ def delta_update( source_dc = datachain.read_dataset(source_ds_name, source_ds_version) source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version) - diff = source_dc_latest.compare(source_dc, on=on, compare=compare) + diff = source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False) # We append all the steps from the original chain to diff, e.g filters, mappers. diff = _append_steps(diff, dc) @@ -111,6 +111,7 @@ def delta_update( on=right_on or on, added=True, modified=False, + deleted=False, ) .union(diff) ) diff --git a/src/datachain/lib/dc/storage.py b/src/datachain/lib/dc/storage.py index 91d37ce66..827180e68 100644 --- a/src/datachain/lib/dc/storage.py +++ b/src/datachain/lib/dc/storage.py @@ -83,7 +83,7 @@ def read_storage( the final dataset and have not been renamed. delta_compare: A list of fields used to check if the same row has been modified in the new version of the source. - If not defined, all fields except those defined in delta_on will be used. + If not defined, all fields except those defined in `delta_on` will be used. 
Returns: DataChain: A DataChain object containing the file information. diff --git a/tests/func/test_delta.py b/tests/func/test_delta.py index 80c013e63..9d6525525 100644 --- a/tests/func/test_delta.py +++ b/tests/func/test_delta.py @@ -147,6 +147,10 @@ def get_index(file: File) -> int: for img in images[5:]: img["data"].save(tmp_dir / img["name"]) + # remove first 5 images to check that deleted rows are not taken into consideration + for img in images[0:5]: + os.remove(tmp_dir / img["name"]) + # second version of delta dataset create_delta_dataset()
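
The resulting delta update API, as exercised in tests/func/test_delta.py above, can be sketched as follows. This is a minimal usage sketch, not a definitive recipe: the storage URI, dataset name and embedding UDF are placeholders, while the read_storage()/save() calls and the delta_* parameters mirror the ones introduced in this series.

```py
import datachain as dc
from datachain.lib.dc import C
from datachain.lib.file import File


def my_embedding(file: File) -> list[float]:
    # placeholder UDF; a real one would compute an embedding from the file
    return [0.5, 0.5]


(
    dc.read_storage(
        "s3://bkt/dir1/",  # placeholder storage URI
        delta=True,  # optimize new versions by processing only the diff
        delta_on=["file.source", "file.path"],  # identity of a row in the source
        delta_compare=["file.version", "file.etag"],  # detects modified rows
    )
    .filter(C("file.path").glob("*.jpg"))
    .map(emb=my_embedding)
    .save("delta_ds")  # later runs re-process only added/modified files
)
```

On the first run this behaves like a regular save; on subsequent runs the diff of the source is computed with `DataChain.compare()`, the filters and mappers are applied only to that diff, and the result is unioned with the latest version of "delta_ds" to produce the new version.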