Annex.py: Update hash algorithm and switch to UNIX timestamps #11
Changes from all commits
```diff
@@ -90,7 +90,7 @@ def is_binary(filepath, blocksize=65536):
 def hashfile(filepath, iosize=65536):
     """Compute a digest of filepath content."""
-    hasher = hashlib.md5()
+    hasher = hashlib.sha3_256()
     with open(filepath, 'rb') as srcfile:
         buf = srcfile.read(iosize)
         while len(buf) > 0:
```
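The hunk stops inside the read loop; for context, here is a plausible completion of the updated function, assuming the usual chunked read/update pattern (the tail of the function is not shown in the diff):

```python
import hashlib

def hashfile(filepath, iosize=65536):
    """Compute a digest of filepath content."""
    hasher = hashlib.sha3_256()
    with open(filepath, 'rb') as srcfile:
        # Read in iosize chunks so large files never sit fully in memory
        buf = srcfile.read(iosize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = srcfile.read(iosize)
    # 64 hex characters for SHA3-256 (32 for the old MD5)
    return hasher.hexdigest()
```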
```diff
@@ -123,10 +123,20 @@ def is_pointer(cls, filepath):
         identifier.
         """
         meta = os.stat(filepath)
+        # MD5
+        if meta.st_size == 32:
+            logging.warning("Using deprecated hash algorithm (MD5)")
+            with open(filepath, encoding='utf-8') as fh:
+                identifier = fh.read(32)
+            return all(byte in string.hexdigits for byte in identifier)
+        # SHA3 256
+        elif meta.st_size == 64:
+            with open(filepath, encoding='utf-8') as fh:
+                identifier = fh.read(64)
+            return all(byte in string.hexdigits for byte in identifier)
         return False

     def get(self, identifier, destpath):
```

Comment on lines 128 to +139

Contributor: minor: merge the two conditions

Author: done (conversation marked as resolved)
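The resolved code is collapsed in this view; a minimal sketch of what the merged condition could look like, assuming it keeps the shared hex-digit check and derives the read length from the file size:

```python
import logging
import os
import string

def is_pointer(filepath):
    """Sketch only: return True if filepath holds a bare hex digest."""
    meta = os.stat(filepath)
    # 32 hex chars = MD5 (deprecated), 64 hex chars = SHA3-256
    if meta.st_size not in (32, 64):
        return False
    if meta.st_size == 32:
        logging.warning("Using deprecated hash algorithm (MD5)")
    with open(filepath, encoding='utf-8') as fh:
        identifier = fh.read(meta.st_size)
    return all(byte in string.hexdigits for byte in identifier)
```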
```diff
@@ -226,13 +236,29 @@ def list(self):
             if not filename.endswith('.info'):
                 info = self._load_metadata(filename)
                 names = info.get('filenames', [])
-                for annexed_file in names.values():
-                    insertion_time = annexed_file['date']
-                    insertion_time = datetime.datetime.strptime(insertion_time, "%c").timestamp()
-
-                    #The file size must come from the filesystem
-                    meta = os.stat(os.path.join(self.path, filename))
-                    yield filename, meta.st_size, insertion_time, names
+                for annexed_file, details in names.items():
+                    insertion_time = details['date']
+
+                    # Handle different date formats (old method)
+                    if isinstance(insertion_time, str):
+                        for fmt in ('%a %b %d %H:%M:%S %Y', '%a %d %b %Y %H:%M:%S %p %Z'):
+                            try:
+                                insertion_time = datetime.datetime.strptime(insertion_time, fmt).timestamp()
+                                break
+                            except ValueError:
+                                continue
+                        else:
+                            raise ValueError(f"Invalid date format in metadata: {insertion_time}")
+
+                    # UNIX timestamp
+                    elif isinstance(insertion_time, (int, float)):
+                        insertion_time = insertion_time
+                    else:
+                        raise ValueError("Invalid date format in metadata")
+
+                    # The file size must come from the filesystem
+                    meta = os.stat(os.path.join(self.path, filename))
+                    yield filename, meta.st_size, insertion_time, [annexed_file]

     def push(self, filepath):
         """
```

On `if not filename.endswith('.info'):`

Contributor: suggest: change the condition to check it ends with a […]

Author: done

On `elif isinstance(insertion_time, (int, float)):`

Contributor: question: is the float type really possible? It seems unlikely for a timestamp.

Author: done

On `insertion_time = insertion_time`

Contributor: major: I don't think this line is really necessary.

Author: to me there are 3 options: […] if an unknown format shows up and is not handled, Rift will crash.

On `raise ValueError("Invalid date format in metadata")`

Contributor: suggest: print the type of the object

Author: done
```diff
@@ -254,21 +280,21 @@ def push(self, filepath):
         destinfo = None
         if os.path.exists(destpath):
             destinfo = os.stat(destpath)
-        if destinfo and destinfo.st_size == originfo.st_size and \
-           filename in metadata.get('filenames', {}):
-            logging.debug('%s is already into annex, skipping it', filename)
-        else:
-            # Update them and write them back
-            fileset = metadata.setdefault('filenames', {})
-            fileset.setdefault(filename, {})
-            fileset[filename]['date'] = time.strftime("%c")
-            self._save_metadata(digest, metadata)
-
-            # Move binary file to annex
-            logging.debug('Importing %s into annex (%s)', filepath, digest)
-            shutil.copyfile(filepath, destpath)
-            os.chmod(destpath, self.WMODE)
+        if destinfo and destinfo.st_size == originfo.st_size and \
+           filename in metadata.get('filenames', {}):
+            logging.debug('%s is already into annex, skipping it', filename)
+            return
+
+        # Update them and write them back
+        fileset = metadata.setdefault('filenames', {})
+        fileset.setdefault(filename, {})
+        fileset[filename]['date'] = time.time()  # Unix timestamp
+        self._save_metadata(digest, metadata)
+
+        # Move binary file to annex
+        logging.debug('Importing %s into annex (%s)', filepath, digest)
+        shutil.copyfile(filepath, destpath)
+        os.chmod(destpath, self.WMODE)

         # Verify permission are correct before copying
         os.chmod(filepath, self.RMODE)
```

Comment on lines 281 to +286

Contributor: question: shouldn't we return an error or at least continue if […]

Author: comment noted for the next MR (S3 annex)
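A short illustration of why the PR moves from `time.strftime("%c")` to `time.time()`: the former is locale-dependent display text, the latter a locale-independent number that sorts and converts cleanly:

```python
import time

legacy = time.strftime("%c")  # e.g. 'Mon Jan  1 12:00:00 2024', varies with locale
epoch = time.time()           # e.g. 1704110400.5, float seconds since the epoch

# A stored timestamp can always be rendered back for display:
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch)))
```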
On the size-based detection in `is_pointer`

Contributor: question: is there no other way to know the type of hash being used? It feels really fragile.

Author: MD5 hashes will always be this size; some other algorithms may produce digests of a similar size, but they are rare. In Rift's case this method has always been used, so a hash of this size can always be expected to be MD5.

Author: I will also add that there is no point in supporting multiple hashes: it will either be MD5 or SHA3-256.
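The size argument is easy to check against hashlib itself, since hex digests have a fixed length per algorithm:

```python
import hashlib

# Hex digest lengths that back the size-based detection in is_pointer()
assert len(hashlib.md5(b'x').hexdigest()) == 32       # MD5
assert len(hashlib.sha3_256(b'x').hexdigest()) == 64  # SHA3-256
```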