Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 49 additions & 23 deletions lib/rift/Annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def is_binary(filepath, blocksize=65536):

def hashfile(filepath, iosize=65536):
"""Compute a digest of filepath content."""
hasher = hashlib.md5()
hasher = hashlib.sha3_256()
with open(filepath, 'rb') as srcfile:
buf = srcfile.read(iosize)
while len(buf) > 0:
Expand Down Expand Up @@ -123,10 +123,20 @@ def is_pointer(cls, filepath):
identifier.
"""
meta = os.stat(filepath)

# MD5
if meta.st_size == 32:
Comment on lines +127 to 128
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: is there no other way to know the type of hash being used ? Feels really fragile

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MD 5 hashes will always be this size, some other algorithms may have similar hash size but they are rare.
In Rift case, this method as always be used so we can always expect hash this size being MD5.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will also add that there is no point of supporting mutliple hashes : it will either be MD5 or SHA256

logging.warning("Using deprecated hash algorithm (MD5)")
with open(filepath, encoding='utf-8') as fh:
identifier = fh.read(32)
return all(byte in string.hexdigits for byte in identifier)

# SHA3 256
elif meta.st_size == 64:
with open(filepath, encoding='utf-8') as fh:
identifier = fh.read(64)
return all(byte in string.hexdigits for byte in identifier)

Comment on lines 128 to +139
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: merge the two conditions

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

return False

def get(self, identifier, destpath):
Expand Down Expand Up @@ -226,13 +236,29 @@ def list(self):
if not filename.endswith('.info'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggest: change the condition to check it ends with a .info and continue if true

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

info = self._load_metadata(filename)
names = info.get('filenames', [])
for annexed_file in names.values():
insertion_time = annexed_file['date']
insertion_time = datetime.datetime.strptime(insertion_time, "%c").timestamp()

#The file size must come from the filesystem
meta = os.stat(os.path.join(self.path, filename))
yield filename, meta.st_size, insertion_time, names
for annexed_file, details in names.items():
insertion_time = details['date']

# Handle different date formats (old method)
if isinstance(insertion_time, str):
for fmt in ('%a %b %d %H:%M:%S %Y', '%a %d %b %Y %H:%M:%S %p %Z'):
try:
insertion_time = datetime.datetime.strptime(insertion_time, fmt).timestamp()
break
except ValueError:
continue
else:
raise ValueError(f"Invalid date format in metadata: {insertion_time}")

# UNIX timestamp
elif isinstance(insertion_time, (int, float)):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: is the float type really possible ? Seems unlikely for a timestamp

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

insertion_time = insertion_time
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

major: don't think this line is really necessary

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to me there is 3 options :

  • old format
  • new format
  • unknown format

if unknown format happen and it not handled, Rift will crash

else:
raise ValueError("Invalid date format in metadata")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggest: print the type of the object

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done


# The file size must come from the filesystem
meta = os.stat(os.path.join(self.path, filename))
yield filename, meta.st_size, insertion_time, [annexed_file]

def push(self, filepath):
"""
Expand All @@ -254,21 +280,21 @@ def push(self, filepath):
destinfo = None
if os.path.exists(destpath):
destinfo = os.stat(destpath)
if destinfo and destinfo.st_size == originfo.st_size and \
filename in metadata.get('filenames', {}):
logging.debug('%s is already into annex, skipping it', filename)

else:
# Update them and write them back
fileset = metadata.setdefault('filenames', {})
fileset.setdefault(filename, {})
fileset[filename]['date'] = time.strftime("%c")
self._save_metadata(digest, metadata)

# Move binary file to annex
logging.debug('Importing %s into annex (%s)', filepath, digest)
shutil.copyfile(filepath, destpath)
os.chmod(destpath, self.WMODE)
if destinfo and destinfo.st_size == originfo.st_size and \
filename in metadata.get('filenames', {}):
logging.debug('%s is already into annex, skipping it', filename)
return
Comment on lines 281 to +286
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: shouldn't we return an error or at least continue if os.path.exists(destpath) is false ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment noted for the next MR (S3 annex)


# Update them and write them back
fileset = metadata.setdefault('filenames', {})
fileset.setdefault(filename, {})
fileset[filename]['date'] = time.time() # Unix timestamp
self._save_metadata(digest, metadata)

# Move binary file to annex
logging.debug('Importing %s into annex (%s)', filepath, digest)
shutil.copyfile(filepath, destpath)
os.chmod(destpath, self.WMODE)

# Verify permission are correct before copying
os.chmod(filepath, self.RMODE)
Expand Down
13 changes: 4 additions & 9 deletions tests/Annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,22 +192,17 @@ def test_delete(self):
self.annex.get_by_path(source_file.name, '/dev/null')

def test_list(self):
""" Test list method """
"""Test the list method"""

source_size = os.stat(self.source.name).st_size
source_insertion_time = datetime.datetime.strptime(time.strftime('%c'), '%c').timestamp()
# Get the current time with the %c format and convert it to unix timestamp to
# have the same method as annex.list (in terms of precision)
source_insertion_time = time.time()
self.annex.push(self.source.name)

# Check if the file pointer is present in the annex list output
# by checking it's attributes
for filename, size, insertion_time, names in self.annex.list():
self.assertEqual(get_digest_from_path(self.source.name), filename)
self.assertEqual(source_size, size)
# As tests can take time to run, accept less or equal 1 second shift
self.assertAlmostEqual(source_insertion_time, insertion_time, delta=1)
self.assertTrue(os.path.basename(self.source.name) in names.keys())
self.assertAlmostEqual(source_insertion_time, insertion_time, delta=1) # delta for potentials delay
self.assertTrue(os.path.basename(self.source.name) in names)

def test_push(self):
""" Test push method """
Expand Down