From caf6b5b90a7cbb0cffc9927c8e0b11732b94befe Mon Sep 17 00:00:00 2001 From: Matt Kulka Date: Tue, 1 Aug 2017 10:31:19 -0700 Subject: [PATCH] add catalog functionality to prevent redownloads --- README.md | 6 ++++-- attributes/default.rb | 3 +++ libraries/s3_file.rb | 27 ++++++++++++++++++++++++--- providers/default.rb | 42 +++++++++++++++++++++++++++++++++++------- 4 files changed, 66 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 9c52624..867827a 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,9 @@ An Amazon Web Services account and something in S3 to fetch. Multi-part S3 uploads do not put the MD5 of the content in the ETag header. If x-amz-meta-digest is provided in User-Defined Metadata on the S3 Object it is processed as if it were a Digest header (RFC 3230). -The MD5 of the local file will be checked against the MD5 from x-amz-meta-digest if it is present. It not it will check against the ETag. If there is no match or the local file is absent it will be downloaded. +The MD5 of the local file will be checked against the MD5 from x-amz-meta-digest if it is present. If not, it will check against the ETag. If there is no match or the local file is absent it will be downloaded. + +By default, a catalog file in Chef's cache path will be kept for all downloaded files, tracking their etag and md5 at time of download. If either of these doesn't match, the file will be downloaded. To disable this behavior, set `node['s3_file']['use_catalog']` to `false`. If credentials are not provided, s3_file will attempt to use the first instance profile associated with the instance. See documentation at http://docs.aws.amazon.com/IAM/latest/UserGuide/instance-profiles.html for more on instance profiles. 
@@ -45,7 +47,7 @@ Example: decryption_key "my SHA256 digest key" decrypted_file_checksum "SHA256 hex digest of decrypted file" end - + #MD5 and Multi-Part Upload s3_file compares the MD5 hash of a local file, if present, and the ETag header of the S3 object. If they do not match, then the remote object will be downloaded and notifiations will be fired. diff --git a/attributes/default.rb b/attributes/default.rb index 27e4ebd..e51b0f2 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -1,2 +1,5 @@ default['s3_file']['mime-types']['version'] = '2.6.2' default['s3_file']['rest-client']['version'] = '1.7.3' + +# Keep a catalog of each downloaded file's etag and md5 at time of download. +default['s3_file']['use_catalog'] = true diff --git a/libraries/s3_file.rb b/libraries/s3_file.rb index 814c7bf..2a0c9a6 100644 --- a/libraries/s3_file.rb +++ b/libraries/s3_file.rb @@ -197,6 +197,15 @@ def self.verify_sha256_checksum(checksum, file) def self.verify_md5_checksum(checksum, file) s3_md5 = checksum + local_md5 = buffered_md5_checksum(file) + + Chef::Log.debug "md5 of remote object is #{s3_md5}" + Chef::Log.debug "md5 of local object is #{local_md5.hexdigest}" + + local_md5.hexdigest == s3_md5 + end + + def self.buffered_md5_checksum(file) local_md5 = Digest::MD5.new # buffer the checksum which should save RAM consumption @@ -205,11 +214,23 @@ def self.verify_md5_checksum(checksum, file) local_md5.update buffer end end + local_md5 + end - Chef::Log.debug "md5 of remote object is #{s3_md5}" - Chef::Log.debug "md5 of local object is #{local_md5.hexdigest}" + def self.verify_etag(etag, file) + catalog.fetch(file, nil) == etag + end - local_md5.hexdigest == s3_md5 + def self.catalog_path + File.join(Chef::Config[:file_cache_path], 's3_file_etags.json') + end + + def self.catalog + File.exist?(catalog_path) ? 
JSON.parse(IO.read(catalog_path)) : {} + end + + def self.write_catalog(data) + File.open(catalog_path, 'w', 0644) { |f| f.write(JSON.dump(data)) } end def self.client diff --git a/providers/default.rb b/providers/default.rb index 28a2439..ff67e15 100644 --- a/providers/default.rb +++ b/providers/default.rb @@ -34,11 +34,11 @@ end if ::File.exists?(new_resource.path) + s3_etag = S3FileLib::get_md5_from_s3(new_resource.bucket, new_resource.s3_url, remote_path, aws_access_key_id, aws_secret_access_key, token) + if decryption_key.nil? if new_resource.decrypted_file_checksum.nil? - s3_md5 = S3FileLib::get_md5_from_s3(new_resource.bucket, new_resource.s3_url, remote_path, aws_access_key_id, aws_secret_access_key, token) - - if S3FileLib::verify_md5_checksum(s3_md5, new_resource.path) + if S3FileLib::verify_md5_checksum(s3_etag, new_resource.path) Chef::Log.debug 'Skipping download, md5sum of local file matches file in S3.' download = false end @@ -59,6 +59,16 @@ end end end + + # Don't download if content and etag match prior download + if node['s3_file']['use_catalog'] + catalog_data = S3FileLib::catalog.fetch(new_resource.path, nil) + existing_file_md5 = S3FileLib::buffered_md5_checksum(new_resource.path) + if catalog_data && existing_file_md5 == catalog_data['local_md5'] && s3_etag == catalog_data['etag'] + Chef::Log.debug 'Skipping download, md5 of local file and etag matches prior download.' 
+ download = false + end + end end if download @@ -78,16 +88,34 @@ raise e end - ::FileUtils.mv(decrypted_file.path, new_resource.path) + downloaded_file = decrypted_file else - ::FileUtils.mv(response.file.path, new_resource.path) + downloaded_file = response.file + end + + # Write etag and md5 to catalog for future reference + if node['s3_file']['use_catalog'] + catalog = S3FileLib::catalog + catalog[new_resource.path] = { + 'etag' => response.headers[:etag].gsub('"',''), + 'local_md5' => S3FileLib::buffered_md5_checksum(downloaded_file.path) + } + S3FileLib::write_catalog(catalog) + end + + # Take ownership and permissions from existing object + if ::File.exist?(new_resource.path) + stat = ::File::Stat.new(new_resource.path) + ::FileUtils.chown(stat.uid, stat.gid, downloaded_file) + ::FileUtils.chmod(stat.mode, downloaded_file) end + ::FileUtils.mv(downloaded_file.path, new_resource.path) end f = file new_resource.path do action :create - owner new_resource.owner || ENV['user'] - group new_resource.group || ENV['user'] + owner new_resource.owner || ENV['USER'] + group new_resource.group || ENV['USER'] mode new_resource.mode || '0644' end