From 4ee7420a6e76f52b1f3db773c0d2024b4b196656 Mon Sep 17 00:00:00 2001 From: Quentin ANIERE Date: Mon, 27 Oct 2025 15:33:24 +0000 Subject: [PATCH 1/9] Annex.py: Update hash algorithm and switch to UNIX timestamps Switch from MD5 to SHA3-256 to avoid any risk of collision. MD5 is still supported for retrieving from the annex; pushing only uses SHA3-256. Switch from the %c date format to a UNIX timestamp to allow collaboration with people using different date formats / timezones. The old format is still supported for reading; pushing only writes UNIX timestamps. --- lib/rift/Annex.py | 75 ++++++++++++++++++++++++++++++----------------- tests/Annex.py | 13 +++----- 2 files changed, 52 insertions(+), 36 deletions(-) diff --git a/lib/rift/Annex.py b/lib/rift/Annex.py index 49ca08b8..8ace9186 100644 --- a/lib/rift/Annex.py +++ b/lib/rift/Annex.py @@ -90,7 +90,7 @@ def is_binary(filepath, blocksize=65536): def hashfile(filepath, iosize=65536): """Compute a digest of filepath content.""" - hasher = hashlib.md5() + hasher = hashlib.sha3_256() with open(filepath, 'rb') as srcfile: buf = srcfile.read(iosize) while len(buf) > 0: @@ -123,10 +123,13 @@ def is_pointer(cls, filepath): identifier. """ meta = os.stat(filepath) - if meta.st_size == 32: + + # MD5 or SHA3 256 + if meta.st_size == 32 or meta.st_size == 64: with open(filepath, encoding='utf-8') as fh: - identifier = fh.read(32) + identifier = fh.read(meta.st_size) return all(byte in string.hexdigits for byte in identifier) + return False def get(self, identifier, destpath): @@ -223,16 +226,34 @@ def list(self): insertion time. 
""" for filename in os.listdir(self.path): - if not filename.endswith('.info'): - info = self._load_metadata(filename) - names = info.get('filenames', []) - for annexed_file in names.values(): - insertion_time = annexed_file['date'] - insertion_time = datetime.datetime.strptime(insertion_time, "%c").timestamp() - - #The file size must come from the filesystem + if filename.endswith(_INFOSUFFIX): + continue + + info = self._load_metadata(filename) + names = info.get('filenames', []) + for annexed_file, details in names.items(): + insertion_time = details['date'] + + # Handle different date formats (old method) + if isinstance(insertion_time, str): + for fmt in ('%a %b %d %H:%M:%S %Y', '%a %d %b %Y %H:%M:%S %p %Z'): + try: + insertion_time = datetime.datetime.strptime(insertion_time, fmt).timestamp() + break + except ValueError: + continue +else: + raise ValueError(f"Invalid date format in metadata: {insertion_time}") + + # UNIX timestamp + elif isinstance(insertion_time, int): + insertion_time = insertion_time + else: + raise ValueError(f"Invalid date format in metadata: {type(insertion_time)})") + + # The file size must come from the filesystem meta = os.stat(os.path.join(self.path, filename)) - yield filename, meta.st_size, insertion_time, names + yield filename, meta.st_size, insertion_time, [annexed_file] def push(self, filepath): """ @@ -254,21 +275,21 @@ def push(self, filepath): destinfo = None if os.path.exists(destpath): destinfo = os.stat(destpath) - if destinfo and destinfo.st_size == originfo.st_size and \ - filename in metadata.get('filenames', {}): - logging.debug('%s is already into annex, skipping it', filename) - - else: - # Update them and write them back - fileset = metadata.setdefault('filenames', {}) - fileset.setdefault(filename, {}) - fileset[filename]['date'] = time.strftime("%c") - self._save_metadata(digest, metadata) - - # Move binary file to annex - logging.debug('Importing %s into annex (%s)', filepath, digest) - 
shutil.copyfile(filepath, destpath) - os.chmod(destpath, self.WMODE) + if destinfo and destinfo.st_size == originfo.st_size and \ + filename in metadata.get('filenames', {}): + logging.debug('%s is already into annex, skipping it', filename) + return + + # Update them and write them back + fileset = metadata.setdefault('filenames', {}) + fileset.setdefault(filename, {}) + fileset[filename]['date'] = time.time() # Unix timestamp + self._save_metadata(digest, metadata) + + # Move binary file to annex + logging.debug('Importing %s into annex (%s)', filepath, digest) + shutil.copyfile(filepath, destpath) + os.chmod(destpath, self.WMODE) # Verify permission are correct before copying os.chmod(filepath, self.RMODE) diff --git a/tests/Annex.py b/tests/Annex.py index 36a51541..5e814dd6 100644 --- a/tests/Annex.py +++ b/tests/Annex.py @@ -192,22 +192,17 @@ def test_delete(self): self.annex.get_by_path(source_file.name, '/dev/null') def test_list(self): - """ Test list method """ + """Test the list method""" source_size = os.stat(self.source.name).st_size - source_insertion_time = datetime.datetime.strptime(time.strftime('%c'), '%c').timestamp() - # Get the current time with the %c format and convert it to unix timestamp to - # have the same method as annex.list (in terms of precision) + source_insertion_time = time.time() self.annex.push(self.source.name) - # Check if the file pointer is present in the annex list output - # by checking it's attributes for filename, size, insertion_time, names in self.annex.list(): self.assertEqual(get_digest_from_path(self.source.name), filename) self.assertEqual(source_size, size) - # As tests can take time to run, accept less or equal 1 second shift - self.assertAlmostEqual(source_insertion_time, insertion_time, delta=1) - self.assertTrue(os.path.basename(self.source.name) in names.keys()) + self.assertAlmostEqual(source_insertion_time, insertion_time, delta=1) # delta for potentials delay + 
self.assertTrue(os.path.basename(self.source.name) in names) def test_push(self): """ Test push method """ From 5f61aa46bca8a29ef36e77fd8840fee87795aa6e Mon Sep 17 00:00:00 2001 From: Quentin ANIERE Date: Mon, 27 Oct 2025 15:40:28 +0000 Subject: [PATCH 2/9] fix typo --- lib/rift/Annex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rift/Annex.py b/lib/rift/Annex.py index 8ace9186..eb9cd398 100644 --- a/lib/rift/Annex.py +++ b/lib/rift/Annex.py @@ -242,7 +242,7 @@ def list(self): break except ValueError: continue -else: + else: raise ValueError(f"Invalid date format in metadata: {insertion_time}") # UNIX timestamp From e8d0a82d0e7276ba1d0ca90e3ceb4fb0aa9ff89f Mon Sep 17 00:00:00 2001 From: Quentin ANIERE Date: Mon, 27 Oct 2025 15:42:59 +0000 Subject: [PATCH 3/9] fix typo --- lib/rift/Annex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rift/Annex.py b/lib/rift/Annex.py index eb9cd398..f497f80f 100644 --- a/lib/rift/Annex.py +++ b/lib/rift/Annex.py @@ -243,7 +243,7 @@ def list(self): except ValueError: continue else: - raise ValueError(f"Invalid date format in metadata: {insertion_time}") + raise ValueError(f"Invalid date format in metadata: {insertion_time}") # UNIX timestamp elif isinstance(insertion_time, int): From cab20fffcb6e65b67ce6cf2e65989ee5a1600259 Mon Sep 17 00:00:00 2001 From: Quentin ANIERE Date: Mon, 27 Oct 2025 15:48:38 +0000 Subject: [PATCH 4/9] fix indent --- lib/rift/Annex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rift/Annex.py b/lib/rift/Annex.py index f497f80f..a5e7e7fa 100644 --- a/lib/rift/Annex.py +++ b/lib/rift/Annex.py @@ -231,7 +231,7 @@ def list(self): info = self._load_metadata(filename) names = info.get('filenames', []) - for annexed_file, details in names.items(): + for annexed_file, details in names.items(): insertion_time = details['date'] # Handle different date formats (old method) From 59585ea041f3531ba155ed2613cd95138630449a Mon Sep 17 00:00:00 2001 
From: Quentin ANIERE Date: Mon, 27 Oct 2025 15:52:52 +0000 Subject: [PATCH 5/9] fix indent --- lib/rift/Annex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rift/Annex.py b/lib/rift/Annex.py index a5e7e7fa..f6e4ba4a 100644 --- a/lib/rift/Annex.py +++ b/lib/rift/Annex.py @@ -242,7 +242,7 @@ def list(self): break except ValueError: continue - else: + else: raise ValueError(f"Invalid date format in metadata: {insertion_time}") # UNIX timestamp From 7c1d6f9c5be3a7282e0a42fd87e6be4e36a7ad50 Mon Sep 17 00:00:00 2001 From: Quentin ANIERE Date: Wed, 29 Oct 2025 14:02:22 +0100 Subject: [PATCH 6/9] fix typo --- lib/rift/Annex.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/rift/Annex.py b/lib/rift/Annex.py index f6e4ba4a..ea8eb910 100644 --- a/lib/rift/Annex.py +++ b/lib/rift/Annex.py @@ -242,9 +242,6 @@ def list(self): break except ValueError: continue - else: - raise ValueError(f"Invalid date format in metadata: {insertion_time}") - # UNIX timestamp elif isinstance(insertion_time, int): insertion_time = insertion_time From 5a1f2e6c50602d367c3bac16f84cf75f3323a041 Mon Sep 17 00:00:00 2001 From: Quentin ANIERE Date: Wed, 29 Oct 2025 16:18:50 +0100 Subject: [PATCH 7/9] fix --- lib/rift/Annex.py | 3 ++- lib/rift/Controller.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/rift/Annex.py b/lib/rift/Annex.py index ea8eb910..191a5867 100644 --- a/lib/rift/Annex.py +++ b/lib/rift/Annex.py @@ -246,7 +246,8 @@ def list(self): elif isinstance(insertion_time, int): insertion_time = insertion_time else: - raise ValueError(f"Invalid date format in metadata: {type(insertion_time)})") + logging.warning("Unknown time format: %s (type %s)", insertion_time, type(insertion_time)) + insertion_time = insertion_time # The file size must come from the filesystem meta = os.stat(os.path.join(self.path, filename)) diff --git a/lib/rift/Controller.py b/lib/rift/Controller.py index 02684272..c05ecc7a 100644 --- 
a/lib/rift/Controller.py +++ b/lib/rift/Controller.py @@ -336,8 +336,12 @@ def action_annex(args, config, staff, modules): print(fmt % ('ID', 'SIZE', 'DATE', 'FILENAMES')) print(fmt % ('--', '----', '----', '---------')) for filename, size, mtime, names in annex.list(): - timestr = time.strftime('%x %X', time.localtime(mtime)) - print(fmt % (filename, size, timestr, ','.join(names))) + try: + timestr = time.strftime('%x %X', time.localtime(mtime)) + print(fmt % (filename, size, timestr, ','.join(names))) + + except TypeError: + print(fmt % (filename, size, mtime, ','.join(names))) elif args.annex_cmd == 'push': for srcfile in args.files: From a15f4542394d3974db15e9123b69dfe6f318ff6c Mon Sep 17 00:00:00 2001 From: Quentin ANIERE Date: Wed, 29 Oct 2025 16:24:55 +0100 Subject: [PATCH 8/9] fix --- lib/rift/Annex.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/rift/Annex.py b/lib/rift/Annex.py index 191a5867..e91b8552 100644 --- a/lib/rift/Annex.py +++ b/lib/rift/Annex.py @@ -238,7 +238,9 @@ def list(self): if isinstance(insertion_time, str): for fmt in ('%a %b %d %H:%M:%S %Y', '%a %d %b %Y %H:%M:%S %p %Z'): try: - insertion_time = datetime.datetime.strptime(insertion_time, fmt).timestamp() + insertion_time = datetime.datetime.strptime( + insertion_time, fmt + ).timestamp() break except ValueError: continue @@ -246,7 +248,11 @@ def list(self): elif isinstance(insertion_time, int): insertion_time = insertion_time else: - logging.warning("Unknown time format: %s (type %s)", insertion_time, type(insertion_time)) + logging.warning( + "Unknown time format: %s (type %s)", + insertion_time, + type(insertion_time) + ) insertion_time = insertion_time # The file size must come from the filesystem From 909e2f5a813b1044eee402da5ef79651bd5b4d52 Mon Sep 17 00:00:00 2001 From: Quentin ANIERE Date: Wed, 29 Oct 2025 16:30:06 +0100 Subject: [PATCH 9/9] fix --- lib/rift/Annex.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git 
a/lib/rift/Annex.py b/lib/rift/Annex.py index e91b8552..39eeae00 100644 --- a/lib/rift/Annex.py +++ b/lib/rift/Annex.py @@ -125,7 +125,7 @@ def is_pointer(cls, filepath): meta = os.stat(filepath) # MD5 or SHA3 256 - if meta.st_size == 32 or meta.st_size == 64: + if meta.st_size in (32, 64): with open(filepath, encoding='utf-8') as fh: identifier = fh.read(meta.st_size) return all(byte in string.hexdigits for byte in identifier) @@ -244,16 +244,12 @@ def list(self): break except ValueError: continue - # UNIX timestamp - elif isinstance(insertion_time, int): - insertion_time = insertion_time else: logging.warning( "Unknown time format: %s (type %s)", insertion_time, type(insertion_time) ) - insertion_time = insertion_time # The file size must come from the filesystem meta = os.stat(os.path.join(self.path, filename))