From 78999e0b345cfd90a5a7564da5dd2d93e225c4f3 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Fri, 9 Aug 2024 10:55:54 +0200 Subject: [PATCH 01/18] Dict based approach --- src/dvc_data/index/diff.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index 8637a4a2..c64f2d20 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -239,7 +239,6 @@ def _diff( # noqa: C901 yield Change(typ, old_entry, new_entry) - def _detect_renames(changes: Iterable[Change]): added = [] deleted = [] @@ -252,11 +251,10 @@ def _detect_renames(changes: Iterable[Change]): else: yield change - def _get_key(change): - return change.key + # Create a dictionary for fast lookup of deletions by hash_info + deleted_dict = {ch.old.hash_info: ch for ch in deleted if ch.old and ch.old.hash_info} - added[:] = sorted(added, key=_get_key) - deleted[:] = sorted(deleted, key=_get_key) + unmatched_deleted = set(deleted_dict.keys()) for change in added: new_entry = change.new @@ -266,24 +264,21 @@ def _get_key(change): yield change continue - index, old_entry = None, None - for idx, ch in enumerate(deleted): - assert ch.old - if ch.old.hash_info == new_entry.hash_info: - index, old_entry = idx, ch.old - break + old_entry = deleted_dict.get(new_entry.hash_info) - if index is not None: - del deleted[index] + if old_entry: + unmatched_deleted.remove(new_entry.hash_info) yield Change( RENAME, - old_entry, + old_entry.old, new_entry, ) else: yield change - yield from deleted + # Yield the remaining unmatched deletions + for hash_info in unmatched_deleted: + yield deleted_dict[hash_info] def diff( # noqa: PLR0913 From 93046b46a104994279efe4ab50f028adc5a21917 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Fri, 9 Aug 2024 14:04:51 +0200 Subject: [PATCH 02/18] Respect multiple changes having the same hash --- src/dvc_data/index/diff.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index c64f2d20..a3d3f25f 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -1,5 +1,6 @@ -from collections import deque +from collections import defaultdict, deque from collections.abc import Iterable +import itertools from typing import TYPE_CHECKING, Any, Callable, Optional from attrs import define @@ -252,9 +253,9 @@ def _detect_renames(changes: Iterable[Change]): yield change # Create a dictionary for fast lookup of deletions by hash_info - deleted_dict = {ch.old.hash_info: ch for ch in deleted if ch.old and ch.old.hash_info} - - unmatched_deleted = set(deleted_dict.keys()) + deleted_dict = defaultdict(set) + for change in deleted: + deleted_dict[change.old.hash_info].add(change) for change in added: new_entry = change.new @@ -264,21 +265,24 @@ def _detect_renames(changes: Iterable[Change]): yield change continue - old_entry = deleted_dict.get(new_entry.hash_info) + # If the new entry is the same as a delted change, + # it is in fact a rename. + # Note: get instead of __getitem__, to avoid creating + # unnecessary entries. + if deleted_dict.get(new_entry.hash_info): + deletion = deleted_dict[new_entry.hash_info].pop() - if old_entry: - unmatched_deleted.remove(new_entry.hash_info) yield Change( RENAME, - old_entry.old, + deletion.old, new_entry, ) else: yield change # Yield the remaining unmatched deletions - for hash_info in unmatched_deleted: - yield deleted_dict[hash_info] + if deleted_dict: + yield from set.union(*deleted_dict.values()) def diff( # noqa: PLR0913 From 6129eb040a97d03ba04e873573e6a72fdc1e4d73 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Fri, 9 Aug 2024 15:17:42 +0200 Subject: [PATCH 03/18] Make renames sorted and deterministic --- src/dvc_data/index/diff.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index a3d3f25f..da2bfa0f 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -252,10 +252,17 @@ def _detect_renames(changes: Iterable[Change]): else: yield change + def _get_key(change): + return change.key + + added.sort(key=_get_key) + deleted.sort(key=_get_key, reverse=True) + + # Create a dictionary for fast lookup of deletions by hash_info - deleted_dict = defaultdict(set) + deleted_dict = defaultdict(list) for change in deleted: - deleted_dict[change.old.hash_info].add(change) + deleted_dict[change.old.hash_info].append(change) for change in added: new_entry = change.new @@ -265,7 +272,7 @@ def _detect_renames(changes: Iterable[Change]): yield change continue - # If the new entry is the same as a delted change, + # If the new entry is the same as a deleted change, # it is in fact a rename. # Note: get instead of __getitem__, to avoid creating # unnecessary entries. @@ -282,7 +289,7 @@ def _detect_renames(changes: Iterable[Change]): # Yield the remaining unmatched deletions if deleted_dict: - yield from set.union(*deleted_dict.values()) + yield from itertools.chain.from_iterable(deleted_dict.values()) def diff( # noqa: PLR0913 From 4d9bb59fa76078b5e6ba58919ef942e2a3074644 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Fri, 9 Aug 2024 15:19:21 +0200 Subject: [PATCH 04/18] test diff of non unique hashes --- tests/index/test_diff.py | 51 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tests/index/test_diff.py b/tests/index/test_diff.py index 9a3fae2a..814a511c 100644 --- a/tests/index/test_diff.py +++ b/tests/index/test_diff.py @@ -60,6 +60,57 @@ def test_diff(): } +def test_diff_non_unique_hash(): + """Test rename when multiple entries share the same hash.""" + def entry(key): + return DataIndexEntry( + key=key, + meta=Meta(), + hash_info=HashInfo(name="md5", value="d3b07384d113edec49eaa6238ad5ff00"), + ) + + old_foo_entry = entry(("foo",)) + old_bar_entry = entry(("bar",)) + old_baz_entry = entry(("baz",)) + old = DataIndex({ + ("foo",): old_foo_entry, + ("bar",): old_bar_entry, + ("baz",): old_baz_entry, + }) + + assert set(diff(old, old, with_unchanged=True)) == { + Change(UNCHANGED, old_foo_entry, old_foo_entry), + Change(UNCHANGED, old_bar_entry, old_bar_entry), + Change(UNCHANGED, old_baz_entry, old_baz_entry), + } + assert set(diff(old, old, with_renames=True, with_unchanged=True)) == { + Change(UNCHANGED, old_foo_entry, old_foo_entry), + Change(UNCHANGED, old_bar_entry, old_bar_entry), + Change(UNCHANGED, old_baz_entry, old_baz_entry), + } + + new_foo_entry = entry(("my","new", "foo",)) + new_bar_entry = entry(("new", "bar",)) + new = DataIndex({ + ("my", "new", "foo",): new_foo_entry, + ("new", "bar",): new_bar_entry, + ("baz",): old_baz_entry, + }) + + assert set(diff(old, new, with_unchanged=True)) == { + Change(ADD, None, new_foo_entry), + Change(DELETE, old_foo_entry, None), + Change(ADD, None, new_bar_entry), + Change(DELETE, old_bar_entry, None), + Change(UNCHANGED, old_baz_entry, old_baz_entry), + } + assert set(diff(old, new, with_renames=True, with_unchanged=True)) == { + Change(RENAME, old_foo_entry, new_foo_entry), + Change(RENAME, old_bar_entry, new_bar_entry), + Change(UNCHANGED, old_baz_entry, old_baz_entry), + } + + def test_diff_no_hashes(): index = DataIndex( { From 9b3de444b3eb5b172fb1b4ee3543bce97198035e Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Fri, 9 Aug 2024 15:31:19 +0200 Subject: [PATCH 05/18] Simplify tests --- tests/index/test_diff.py | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/tests/index/test_diff.py b/tests/index/test_diff.py index 814a511c..a2aea22b 100644 --- a/tests/index/test_diff.py +++ b/tests/index/test_diff.py @@ -70,44 +70,32 @@ def entry(key): ) old_foo_entry = entry(("foo",)) - old_bar_entry = entry(("bar",)) - old_baz_entry = entry(("baz",)) old = DataIndex({ ("foo",): old_foo_entry, - ("bar",): old_bar_entry, - ("baz",): old_baz_entry, }) assert set(diff(old, old, with_unchanged=True)) == { Change(UNCHANGED, old_foo_entry, old_foo_entry), - Change(UNCHANGED, old_bar_entry, old_bar_entry), - Change(UNCHANGED, old_baz_entry, old_baz_entry), } assert set(diff(old, old, with_renames=True, with_unchanged=True)) == { Change(UNCHANGED, old_foo_entry, old_foo_entry), - Change(UNCHANGED, old_bar_entry, old_bar_entry), - Change(UNCHANGED, old_baz_entry, old_baz_entry), } - new_foo_entry = entry(("my","new", "foo",)) - new_bar_entry = entry(("new", "bar",)) + new_foo_1 = entry(("a/foo.txt",)) + new_foo_2 = entry(("foo.md",)) new = DataIndex({ - ("my", "new", "foo",): new_foo_entry, - ("new", "bar",): new_bar_entry, - ("baz",): old_baz_entry, + new_foo_1.key: new_foo_1, + new_foo_2.key: new_foo_2, }) assert set(diff(old, new, with_unchanged=True)) == { - Change(ADD, None, new_foo_entry), + Change(ADD, None, new_foo_1), + Change(ADD, None, new_foo_2), Change(DELETE, old_foo_entry, None), - Change(ADD, None, new_bar_entry), - Change(DELETE, old_bar_entry, None), - Change(UNCHANGED, old_baz_entry, old_baz_entry), } assert set(diff(old, new, with_renames=True, with_unchanged=True)) == { - Change(RENAME, old_foo_entry, new_foo_entry), - Change(RENAME, old_bar_entry, new_bar_entry), - Change(UNCHANGED, old_baz_entry, old_baz_entry), + Change(RENAME, old_foo_entry, new_foo_1), + Change(ADD, None, new_foo_2), } From cccf5e0039735a97eb1a6b1a3e463a1a5390c448 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Fri, 9 Aug 2024 16:02:47 +0200 Subject: [PATCH 06/18] Make test case illustrate edge case --- tests/index/test_diff.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tests/index/test_diff.py b/tests/index/test_diff.py index a2aea22b..bdfaee80 100644 --- a/tests/index/test_diff.py +++ b/tests/index/test_diff.py @@ -70,32 +70,37 @@ def entry(key): ) old_foo_entry = entry(("foo",)) + old_bar_entry = entry(("bar",)) old = DataIndex({ - ("foo",): old_foo_entry, + old_foo_entry.key: old_foo_entry, + old_bar_entry.key: old_bar_entry, }) assert set(diff(old, old, with_unchanged=True)) == { Change(UNCHANGED, old_foo_entry, old_foo_entry), + Change(UNCHANGED, old_bar_entry, old_bar_entry), } assert set(diff(old, old, with_renames=True, with_unchanged=True)) == { Change(UNCHANGED, old_foo_entry, old_foo_entry), + Change(UNCHANGED, old_bar_entry, old_bar_entry), } - new_foo_1 = entry(("a/foo.txt",)) - new_foo_2 = entry(("foo.md",)) + new_foo_entry = entry(("foo.txt",)) + new_bar_entry = entry(("zab", "bar",)) new = DataIndex({ - new_foo_1.key: new_foo_1, - new_foo_2.key: new_foo_2, + new_foo_entry.key: new_foo_entry, + new_bar_entry.key: new_bar_entry, }) assert set(diff(old, new, with_unchanged=True)) == { - Change(ADD, None, new_foo_1), - Change(ADD, None, new_foo_2), + Change(ADD, None, new_foo_entry), + Change(ADD, None, new_bar_entry), Change(DELETE, old_foo_entry, None), + Change(DELETE, old_bar_entry, None), } assert set(diff(old, new, with_renames=True, with_unchanged=True)) == { - Change(RENAME, old_foo_entry, new_foo_1), - Change(ADD, None, new_foo_2), + Change(RENAME, old_foo_entry, new_foo_entry), + Change(RENAME, old_bar_entry, new_bar_entry), } From fb8aee0d94272d588fe5675381238ba3916214e0 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Fri, 9 Aug 2024 16:04:45 +0200 Subject: [PATCH 07/18] Fix import order --- src/dvc_data/index/diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index da2bfa0f..eeff8d6a 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -1,6 +1,6 @@ +import itertools from collections import defaultdict, deque from collections.abc import Iterable -import itertools from typing import TYPE_CHECKING, Any, Callable, Optional from attrs import define From f61fc23c4438881de1316ed3b3c02e54a6134ecb Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Fri, 9 Aug 2024 17:28:32 +0200 Subject: [PATCH 08/18] Apply formatting --- src/dvc_data/index/diff.py | 2 +- tests/index/test_diff.py | 55 +++++++++++++++++--------------------- 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index eeff8d6a..d1fbb500 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -240,6 +240,7 @@ def _diff( # noqa: C901 yield Change(typ, old_entry, new_entry) + def _detect_renames(changes: Iterable[Change]): added = [] deleted = [] @@ -258,7 +259,6 @@ def _get_key(change): added.sort(key=_get_key) deleted.sort(key=_get_key, reverse=True) - # Create a dictionary for fast lookup of deletions by hash_info deleted_dict = defaultdict(list) for change in deleted: diff --git a/tests/index/test_diff.py b/tests/index/test_diff.py index bdfaee80..d497973f 100644 --- a/tests/index/test_diff.py +++ b/tests/index/test_diff.py @@ -61,48 +61,41 @@ def test_diff(): def test_diff_non_unique_hash(): - """Test rename when multiple entries share the same hash.""" - def entry(key): + """Test renaming behavior when multiple entries share the same hash.""" + + def create_entry(key): return DataIndexEntry( key=key, meta=Meta(), hash_info=HashInfo(name="md5", value="d3b07384d113edec49eaa6238ad5ff00"), ) - old_foo_entry = entry(("foo",)) - old_bar_entry = entry(("bar",)) - old = DataIndex({ - old_foo_entry.key: old_foo_entry, - old_bar_entry.key: old_bar_entry, - }) - - assert set(diff(old, old, with_unchanged=True)) == { - Change(UNCHANGED, old_foo_entry, old_foo_entry), - Change(UNCHANGED, old_bar_entry, old_bar_entry), - } - assert set(diff(old, old, with_renames=True, with_unchanged=True)) == { - Change(UNCHANGED, old_foo_entry, old_foo_entry), - Change(UNCHANGED, old_bar_entry, old_bar_entry), - } + initial_entries = [create_entry(("foo",)), create_entry(("bar",))] + intermediate_entries = [create_entry(("foo.txt",)), create_entry(("bar",))] + final_entries = [create_entry(("foo.txt",)), create_entry(("zab", "bar"))] - new_foo_entry = entry(("foo.txt",)) - new_bar_entry = entry(("zab", "bar",)) - new = DataIndex({ - new_foo_entry.key: new_foo_entry, - new_bar_entry.key: new_bar_entry, - }) + initial = DataIndex({entry.key: entry for entry in initial_entries}) + intermediate = DataIndex({entry.key: entry for entry in intermediate_entries}) + final = DataIndex({entry.key: entry for entry in final_entries}) - assert set(diff(old, new, with_unchanged=True)) == { - Change(ADD, None, new_foo_entry), - Change(ADD, None, new_bar_entry), - Change(DELETE, old_foo_entry, None), - Change(DELETE, old_bar_entry, None), + expected_initial_intermediate_diff = { + Change(RENAME, initial[("foo",)], intermediate[("foo.txt",)]), + Change(UNCHANGED, initial[("bar",)], initial[("bar",)]), } - assert set(diff(old, new, with_renames=True, with_unchanged=True)) == { - Change(RENAME, old_foo_entry, new_foo_entry), - Change(RENAME, old_bar_entry, new_bar_entry), + expected_initial_final_diff = { + Change(RENAME, initial[("foo",)], final[("foo.txt",)]), + Change(RENAME, initial[("bar",)], final[("zab", "bar")]), } + assert ( + set(diff(initial, intermediate, with_renames=True, with_unchanged=True)) + == expected_initial_intermediate_diff + ) + assert ( + set(diff(initial, final, with_renames=True, with_unchanged=True)) + == expected_initial_final_diff + ) + def test_diff_no_hashes(): index = DataIndex( From 01504e112142ce17001a7d580772573ab378b0a5 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Mon, 12 Aug 2024 09:35:30 +0200 Subject: [PATCH 09/18] Strict assert --- src/dvc_data/index/diff.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index d1fbb500..69115699 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -1,7 +1,7 @@ import itertools from collections import defaultdict, deque from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Callable, Optional +from typing import TYPE_CHECKING, Any, Callable, Optional, cast from attrs import define from fsspec.callbacks import DEFAULT_CALLBACK, Callback @@ -247,8 +247,10 @@ def _detect_renames(changes: Iterable[Change]): for change in changes: if change.typ == ADD: + assert change.new added.append(change) elif change.typ == DELETE: + assert change.old deleted.append(change) else: yield change @@ -256,17 +258,20 @@ def _detect_renames(changes: Iterable[Change]): def _get_key(change): return change.key + # Sort the lists to maintain the same order + # as older implementation. added.sort(key=_get_key) deleted.sort(key=_get_key, reverse=True) # Create a dictionary for fast lookup of deletions by hash_info deleted_dict = defaultdict(list) for change in deleted: - deleted_dict[change.old.hash_info].append(change) + # We checked change.old for all deleted above, so cast + deleted_dict[cast(DataIndexEntry, change.old).hash_info].append(change) for change in added: - new_entry = change.new - assert new_entry + # We checked change.new for all new above, so cast + new_entry = cast(DataIndexEntry, change.new) if not new_entry.hash_info: yield change From 86a3e34556985ee0b898e362d838a3560ae6b14a Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Mon, 12 Aug 2024 09:52:01 +0200 Subject: [PATCH 10/18] Use deque instead of list --- src/dvc_data/index/diff.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index 69115699..de4ba2a4 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -261,13 +261,15 @@ def _get_key(change): # Sort the lists to maintain the same order # as older implementation. added.sort(key=_get_key) - deleted.sort(key=_get_key, reverse=True) + deleted.sort(key=_get_key) # Create a dictionary for fast lookup of deletions by hash_info - deleted_dict = defaultdict(list) + deleted_dict = defaultdict(deque) for change in deleted: # We checked change.old for all deleted above, so cast - deleted_dict[cast(DataIndexEntry, change.old).hash_info].append(change) + change_hash = cast(DataIndexEntry, change.old).hash_info + # appendleft to get queue behaviour (we pop off right) + deleted_dict[change_hash].appendleft(change) for change in added: # We checked change.new for all new above, so cast From b3a4355da26379925edb46b3b8ed9c0e8d7b0a54 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Mon, 12 Aug 2024 09:52:22 +0200 Subject: [PATCH 11/18] Change rename test output --- tests/index/test_diff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/index/test_diff.py b/tests/index/test_diff.py index d497973f..2eb619cb 100644 --- a/tests/index/test_diff.py +++ b/tests/index/test_diff.py @@ -83,8 +83,8 @@ def create_entry(key): Change(UNCHANGED, initial[("bar",)], initial[("bar",)]), } expected_initial_final_diff = { - Change(RENAME, initial[("foo",)], final[("foo.txt",)]), - Change(RENAME, initial[("bar",)], final[("zab", "bar")]), + Change(RENAME, initial[("foo",)], final[("zab", "bar")]), + Change(RENAME, initial[("bar",)], final[("foo.txt",)]), } assert ( From 6f497aaa1666002a8f3f9b00fe9ae867e0e16966 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Mon, 12 Aug 2024 10:08:19 +0200 Subject: [PATCH 12/18] Revert test to old --- tests/index/test_diff.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/tests/index/test_diff.py b/tests/index/test_diff.py index 2eb619cb..9a3fae2a 100644 --- a/tests/index/test_diff.py +++ b/tests/index/test_diff.py @@ -60,43 +60,6 @@ def test_diff(): } -def test_diff_non_unique_hash(): - """Test renaming behavior when multiple entries share the same hash.""" - - def create_entry(key): - return DataIndexEntry( - key=key, - meta=Meta(), - hash_info=HashInfo(name="md5", value="d3b07384d113edec49eaa6238ad5ff00"), - ) - - initial_entries = [create_entry(("foo",)), create_entry(("bar",))] - intermediate_entries = [create_entry(("foo.txt",)), create_entry(("bar",))] - final_entries = [create_entry(("foo.txt",)), create_entry(("zab", "bar"))] - - initial = DataIndex({entry.key: entry for entry in initial_entries}) - intermediate = DataIndex({entry.key: entry for entry in intermediate_entries}) - final = DataIndex({entry.key: entry for entry in final_entries}) - - expected_initial_intermediate_diff = { - Change(RENAME, initial[("foo",)], intermediate[("foo.txt",)]), - Change(UNCHANGED, initial[("bar",)], initial[("bar",)]), - } - expected_initial_final_diff = { - Change(RENAME, initial[("foo",)], final[("zab", "bar")]), - Change(RENAME, initial[("bar",)], final[("foo.txt",)]), - } - - assert ( - set(diff(initial, intermediate, with_renames=True, with_unchanged=True)) - == expected_initial_intermediate_diff - ) - assert ( - set(diff(initial, final, with_renames=True, with_unchanged=True)) - == expected_initial_final_diff - ) - - def test_diff_no_hashes(): index = DataIndex( { From b145b8a3a375c1d7cb4cd9b70aa933896f0bbda6 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Mon, 12 Aug 2024 10:22:32 +0200 Subject: [PATCH 13/18] Add missing type annotation --- src/dvc_data/index/diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index de4ba2a4..a40bd810 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -264,7 +264,7 @@ def _get_key(change): deleted.sort(key=_get_key) # Create a dictionary for fast lookup of deletions by hash_info - deleted_dict = defaultdict(deque) + deleted_dict: dict[HashInfo | None, deque[Change]] = defaultdict(deque) for change in deleted: # We checked change.old for all deleted above, so cast change_hash = cast(DataIndexEntry, change.old).hash_info From 21e0223bc5a9f5ecf1b681840b63c3b843d930f8 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Mon, 12 Aug 2024 10:39:48 +0200 Subject: [PATCH 14/18] Use Optional not X | None --- src/dvc_data/index/diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index a40bd810..40fd5eb8 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -264,7 +264,7 @@ def _get_key(change): deleted.sort(key=_get_key) # Create a dictionary for fast lookup of deletions by hash_info - deleted_dict: dict[HashInfo | None, deque[Change]] = defaultdict(deque) + deleted_dict: dict[Optional[HashInfo], deque[Change]] = defaultdict(deque) for change in deleted: # We checked change.old for all deleted above, so cast change_hash = cast(DataIndexEntry, change.old).hash_info From f25eca41108f26a49b2717d307e2c01d19a4d040 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Tue, 13 Aug 2024 11:12:53 +0200 Subject: [PATCH 15/18] handle missing new and old with if --- src/dvc_data/index/diff.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index 40fd5eb8..7f8e3b19 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -242,15 +242,13 @@ def _diff( # noqa: C901 def _detect_renames(changes: Iterable[Change]): - added = [] - deleted = [] + added: list[Change] = [] + deleted: list[Change] = [] for change in changes: if change.typ == ADD: - assert change.new added.append(change) elif change.typ == DELETE: - assert change.old deleted.append(change) else: yield change @@ -266,30 +264,24 @@ def _get_key(change): # Create a dictionary for fast lookup of deletions by hash_info deleted_dict: dict[Optional[HashInfo], deque[Change]] = defaultdict(deque) for change in deleted: - # We checked change.old for all deleted above, so cast - change_hash = cast(DataIndexEntry, change.old).hash_info + change_hash = change.old.hash_info if change.old else None # appendleft to get queue behaviour (we pop off right) deleted_dict[change_hash].appendleft(change) for change in added: - # We checked change.new for all new above, so cast - new_entry = cast(DataIndexEntry, change.new) - - if not new_entry.hash_info: - yield change - continue + new_hash_info = change.new.hash_info if change.new else None # If the new entry is the same as a deleted change, # it is in fact a rename. # Note: get instead of __getitem__, to avoid creating # unnecessary entries. - if deleted_dict.get(new_entry.hash_info): - deletion = deleted_dict[new_entry.hash_info].pop() + if new_hash_info and (queue := deleted_dict.get(new_hash_info)): + deletion = queue.pop() yield Change( RENAME, deletion.old, - new_entry, + change.new, ) else: yield change From db0407e6a4356cf117d22f68d52c59fa271a4efc Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Tue, 13 Aug 2024 11:16:11 +0200 Subject: [PATCH 16/18] rename change to deletion and addition --- src/dvc_data/index/diff.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index 7f8e3b19..32a32f3b 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -263,13 +263,13 @@ def _get_key(change): # Create a dictionary for fast lookup of deletions by hash_info deleted_dict: dict[Optional[HashInfo], deque[Change]] = defaultdict(deque) - for change in deleted: - change_hash = change.old.hash_info if change.old else None + for deletion in deleted: + change_hash = deletion.old.hash_info if deletion.old else None # appendleft to get queue behaviour (we pop off right) - deleted_dict[change_hash].appendleft(change) + deleted_dict[change_hash].appendleft(deletion) - for change in added: - new_hash_info = change.new.hash_info if change.new else None + for addition in added: + new_hash_info = addition.new.hash_info if addition.new else None # If the new entry is the same as a deleted change, # it is in fact a rename. @@ -281,10 +281,10 @@ def _get_key(change): yield Change( RENAME, deletion.old, - change.new, + addition.new, ) else: - yield change + yield addition # Yield the remaining unmatched deletions if deleted_dict: From d97809266f93fc5fc105bcf5f61b1927af2026e3 Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Tue, 13 Aug 2024 11:25:03 +0200 Subject: [PATCH 17/18] fixup! handle missing new and old with if --- src/dvc_data/index/diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index 32a32f3b..1d11e373 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -1,7 +1,7 @@ import itertools from collections import defaultdict, deque from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Callable, Optional, cast +from typing import TYPE_CHECKING, Any, Callable, Optional from attrs import define from fsspec.callbacks import DEFAULT_CALLBACK, Callback From 9e53ce507b07bdcbd92fa70e1bc3c00d4c903ccd Mon Sep 17 00:00:00 2001 From: "Thorvald M. Ballestad" Date: Tue, 13 Aug 2024 13:57:39 +0200 Subject: [PATCH 18/18] remove comment on sort --- src/dvc_data/index/diff.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/dvc_data/index/diff.py b/src/dvc_data/index/diff.py index 1d11e373..2a776851 100644 --- a/src/dvc_data/index/diff.py +++ b/src/dvc_data/index/diff.py @@ -256,8 +256,6 @@ def _detect_renames(changes: Iterable[Change]): def _get_key(change): return change.key - # Sort the lists to maintain the same order - # as older implementation. added.sort(key=_get_key) deleted.sort(key=_get_key)