Skip to content

Commit

Permalink
Flexible checksums for S3 high-level commands (#8955)
Browse files Browse the repository at this point in the history
  • Loading branch information
aemous authored Oct 3, 2024
1 parent 2667d4e commit cb60175
Show file tree
Hide file tree
Showing 8 changed files with 335 additions and 1 deletion.
5 changes: 5 additions & 0 deletions .changes/next-release/feature-s3-95496.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"type": "feature",
"category": "s3",
"description": "Adds ``--checksum-mode`` and ``--checksum-algorithm`` parameters to high-level ``s3`` commands."
}
37 changes: 36 additions & 1 deletion awscli/customizations/s3/subcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,17 @@
)
}

CHECKSUM_MODE = {
'name': 'checksum-mode', 'choices': ['ENABLED'],
'help_text': 'To retrieve the checksum, this mode must be enabled. If the object has a '
'checksum, it will be verified.'
}

CHECKSUM_ALGORITHM = {
'name': 'checksum-algorithm', 'choices': ['CRC32', 'SHA256', 'SHA1', 'CRC32C'],
'help_text': 'Indicates the algorithm used to create the checksum for the object.'
}

TRANSFER_ARGS = [DRYRUN, QUIET, INCLUDE, EXCLUDE, ACL,
FOLLOW_SYMLINKS, NO_FOLLOW_SYMLINKS, NO_GUESS_MIME_TYPE,
SSE, SSE_C, SSE_C_KEY, SSE_KMS_KEY_ID, SSE_C_COPY_SOURCE,
Expand All @@ -459,7 +470,7 @@
CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE,
EXPIRES, SOURCE_REGION, ONLY_SHOW_ERRORS, NO_PROGRESS,
PAGE_SIZE, IGNORE_GLACIER_WARNINGS, FORCE_GLACIER_TRANSFER,
REQUEST_PAYER]
REQUEST_PAYER, CHECKSUM_MODE, CHECKSUM_ALGORITHM]


def get_client(session, region, endpoint_url, verify, config=None):
Expand Down Expand Up @@ -1242,6 +1253,17 @@ def _validate_path_args(self):
if self._should_emit_validate_s3_paths_warning():
self._emit_validate_s3_paths_warning()

if params.get('checksum_algorithm'):
self._raise_if_paths_type_incorrect_for_param(
CHECKSUM_ALGORITHM['name'],
params['paths_type'],
['locals3', 's3s3'])
if params.get('checksum_mode'):
self._raise_if_paths_type_incorrect_for_param(
CHECKSUM_MODE['name'],
params['paths_type'],
['s3local'])

# If the user provided local path does not exist, hard fail because
# we know that we will not be able to upload the file.
if 'locals3' == params['paths_type'] and not params['is_stream']:
Expand Down Expand Up @@ -1325,6 +1347,19 @@ def _raise_if_mv_same_paths(self, src, dest):
f"{self.parameters['src']} - {self.parameters['dest']}"
)

def _raise_if_paths_type_incorrect_for_param(self, param, paths_type, allowed_paths):
if paths_type not in allowed_paths:
expected_usage_map = {
'locals3': '<LocalPath> <S3Uri>',
's3s3': '<S3Uri> <S3Uri>',
's3local': '<S3Uri> <LocalPath>',
's3': '<S3Uri>'
}
raise ValueError(
f"Expected {param} parameter to be used with one of following path formats: "
f"{', '.join([expected_usage_map[path] for path in allowed_paths])}. Instead, received {expected_usage_map[paths_type]}."
)

def _normalize_s3_trailing_slash(self, paths):
for i, path in enumerate(paths):
if path.startswith('s3://'):
Expand Down
13 changes: 13 additions & 0 deletions awscli/customizations/s3/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,12 +474,14 @@ def map_put_object_params(cls, request_params, cli_params):
cls._set_sse_request_params(request_params, cli_params)
cls._set_sse_c_request_params(request_params, cli_params)
cls._set_request_payer_param(request_params, cli_params)
cls._set_checksum_algorithm_param(request_params, cli_params)

@classmethod
def map_get_object_params(cls, request_params, cli_params):
"""Map CLI params to GetObject request params"""
cls._set_sse_c_request_params(request_params, cli_params)
cls._set_request_payer_param(request_params, cli_params)
cls._set_checksum_mode_param(request_params, cli_params)

@classmethod
def map_copy_object_params(cls, request_params, cli_params):
Expand All @@ -492,6 +494,7 @@ def map_copy_object_params(cls, request_params, cli_params):
cls._set_sse_c_and_copy_source_request_params(
request_params, cli_params)
cls._set_request_payer_param(request_params, cli_params)
cls._set_checksum_algorithm_param(request_params, cli_params)

@classmethod
def map_head_object_params(cls, request_params, cli_params):
Expand Down Expand Up @@ -534,6 +537,16 @@ def _set_request_payer_param(cls, request_params, cli_params):
if cli_params.get('request_payer'):
request_params['RequestPayer'] = cli_params['request_payer']

@classmethod
def _set_checksum_mode_param(cls, request_params, cli_params):
if cli_params.get('checksum_mode'):
request_params['ChecksumMode'] = cli_params['checksum_mode']

@classmethod
def _set_checksum_algorithm_param(cls, request_params, cli_params):
if cli_params.get('checksum_algorithm'):
request_params['ChecksumAlgorithm'] = cli_params['checksum_algorithm']

@classmethod
def _set_general_object_params(cls, request_params, cli_params):
# Parameters set in this method should be applicable to the following
Expand Down
81 changes: 81 additions & 0 deletions tests/functional/s3/test_cp_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,87 @@ def test_cp_with_error_and_warning_permissions(self):
self.assertIn('upload failed', stderr)
self.assertIn('warning: File has an invalid timestamp.', stderr)

def test_upload_with_checksum_algorithm_crc32(self):
full_path = self.files.create_file('foo.txt', 'contents')
cmdline = f'{self.prefix} {full_path} s3://bucket/key.txt --checksum-algorithm CRC32'
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(self.operations_called[0][0].name, 'PutObject')
self.assertEqual(self.operations_called[0][1]['ChecksumAlgorithm'], 'CRC32')

@requires_crt
def test_upload_with_checksum_algorithm_crc32c(self):
full_path = self.files.create_file('foo.txt', 'contents')
cmdline = f'{self.prefix} {full_path} s3://bucket/key.txt --checksum-algorithm CRC32C'
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(self.operations_called[0][0].name, 'PutObject')
self.assertEqual(self.operations_called[0][1]['ChecksumAlgorithm'], 'CRC32C')

def test_multipart_upload_with_checksum_algorithm_crc32(self):
full_path = self.files.create_file('foo.txt', 'a' * 10 * (1024 ** 2))
self.parsed_responses = [
{'UploadId': 'foo'},
{'ETag': 'foo-e1', 'ChecksumCRC32': 'foo-1'},
{'ETag': 'foo-e2', 'ChecksumCRC32': 'foo-2'},
{}
]
cmdline = ('%s %s s3://bucket/key2.txt'
' --checksum-algorithm CRC32' % (self.prefix, full_path))
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(len(self.operations_called), 4, self.operations_called)
self.assertEqual(self.operations_called[0][0].name, 'CreateMultipartUpload')
self.assertEqual(self.operations_called[0][1]['ChecksumAlgorithm'], 'CRC32')
self.assertEqual(self.operations_called[1][0].name, 'UploadPart')
self.assertEqual(self.operations_called[1][1]['ChecksumAlgorithm'], 'CRC32')
self.assertEqual(self.operations_called[3][0].name, 'CompleteMultipartUpload')
self.assertIn({'ETag': 'foo-e1', 'ChecksumCRC32': 'foo-1', 'PartNumber': 1},
self.operations_called[3][1]['MultipartUpload']['Parts'])
self.assertIn({'ETag': 'foo-e2', 'ChecksumCRC32': 'foo-2', 'PartNumber': 2},
self.operations_called[3][1]['MultipartUpload']['Parts'])

def test_copy_with_checksum_algorithm_crc32(self):
self.parsed_responses = [
self.head_object_response(),
# Mocked CopyObject response with a CRC32 checksum specified
{
'ETag': 'foo-1',
'ChecksumCRC32': 'Tq0H4g=='
}
]
cmdline = f'{self.prefix} s3://bucket1/key.txt s3://bucket2/key.txt --checksum-algorithm CRC32'
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(self.operations_called[1][0].name, 'CopyObject')
self.assertEqual(self.operations_called[1][1]['ChecksumAlgorithm'], 'CRC32')

def test_download_with_checksum_mode_crc32(self):
self.parsed_responses = [
self.head_object_response(),
# Mocked GetObject response with a checksum algorithm specified
{
'ETag': 'foo-1',
'ChecksumCRC32': 'Tq0H4g==',
'Body': BytesIO(b'foo')
}
]
cmdline = f'{self.prefix} s3://bucket/foo {self.files.rootdir} --checksum-mode ENABLED'
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(self.operations_called[1][0].name, 'GetObject')
self.assertEqual(self.operations_called[1][1]['ChecksumMode'], 'ENABLED')

def test_download_with_checksum_mode_crc32c(self):
self.parsed_responses = [
self.head_object_response(),
# Mocked GetObject response with a checksum algorithm specified
{
'ETag': 'foo-1',
'ChecksumCRC32C': 'checksum',
'Body': BytesIO(b'foo')
}
]
cmdline = f'{self.prefix} s3://bucket/foo {self.files.rootdir} --checksum-mode ENABLED'
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(self.operations_called[1][0].name, 'GetObject')
self.assertEqual(self.operations_called[1][1]['ChecksumMode'], 'ENABLED')


class TestStreamingCPCommand(BaseAWSCommandParamsTest):
def test_streaming_upload(self):
Expand Down
23 changes: 23 additions & 0 deletions tests/functional/s3/test_mv_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,29 @@ def test_copy_move_with_request_payer(self):
]
)

def test_upload_with_checksum_algorithm_crc32(self):
full_path = self.files.create_file('foo.txt', 'contents')
cmdline = f'{self.prefix} {full_path} s3://bucket/key.txt --checksum-algorithm CRC32'
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(self.operations_called[0][0].name, 'PutObject')
self.assertEqual(self.operations_called[0][1]['ChecksumAlgorithm'], 'CRC32')

def test_download_with_checksum_mode_crc32(self):
self.parsed_responses = [
self.head_object_response(),
# Mocked GetObject response with a checksum algorithm specified
{
'ETag': 'foo-1',
'ChecksumCRC32': 'checksum',
'Body': BytesIO(b'foo')
},
self.delete_object_response()
]
cmdline = f'{self.prefix} s3://bucket/foo {self.files.rootdir} --checksum-mode ENABLED'
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(self.operations_called[1][0].name, 'GetObject')
self.assertEqual(self.operations_called[1][1]['ChecksumMode'], 'ENABLED')


class TestMvCommandWithValidateSameS3Paths(BaseS3TransferCommandTest):

Expand Down
87 changes: 87 additions & 0 deletions tests/functional/s3/test_sync_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,93 @@ def test_with_accesspoint_arn(self):
]
)

def test_upload_with_checksum_algorithm_sha1(self):
self.files.create_file('foo.txt', 'contents')
cmdline = f'{self.prefix} {self.files.rootdir} s3://bucket/ --checksum-algorithm SHA1'
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(self.operations_called[1][0].name, 'PutObject')
self.assertEqual(self.operations_called[1][1]['ChecksumAlgorithm'], 'SHA1')

def test_copy_with_checksum_algorithm_update_sha1(self):
cmdline = f'{self.prefix} s3://src-bucket/ s3://dest-bucket/ --checksum-algorithm SHA1'
self.parsed_responses = [
# Response for ListObjects on source bucket
{
'Contents': [
{
'Key': 'mykey',
'LastModified': '00:00:00Z',
'Size': 100,
'ChecksumAlgorithm': 'SHA1'
}
],
'CommonPrefixes': []
},
# Response for ListObjects on destination bucket
self.list_objects_response([]),
# Response for CopyObject
{
'ChecksumSHA1': 'sha1-checksum'
}
]
self.run_cmd(cmdline, expected_rc=0)
self.assert_operations_called(
[
self.list_objects_request('src-bucket'),
self.list_objects_request('dest-bucket'),
(
'CopyObject', {
'CopySource': {
'Bucket': 'src-bucket',
'Key': 'mykey'
},
'Bucket': 'dest-bucket',
'Key': 'mykey',
'ChecksumAlgorithm': 'SHA1'
}
)
]
)

def test_upload_with_checksum_algorithm_sha256(self):
self.files.create_file('foo.txt', 'contents')
cmdline = f'{self.prefix} {self.files.rootdir} s3://bucket/ --checksum-algorithm SHA256'
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(self.operations_called[1][0].name, 'PutObject')
self.assertEqual(self.operations_called[1][1]['ChecksumAlgorithm'], 'SHA256')

def test_download_with_checksum_mode_sha1(self):
self.parsed_responses = [
self.list_objects_response(['bucket']),
# Mocked GetObject response with a checksum algorithm specified
{
'ETag': 'foo-1',
'ChecksumSHA1': 'checksum',
'Body': BytesIO(b'foo')
}
]
cmdline = f'{self.prefix} s3://bucket/foo {self.files.rootdir} --checksum-mode ENABLED'
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(self.operations_called[0][0].name, 'ListObjectsV2')
self.assertEqual(self.operations_called[1][0].name, 'GetObject')
self.assertIn(('ChecksumMode', 'ENABLED'), self.operations_called[1][1].items())

def test_download_with_checksum_mode_sha256(self):
self.parsed_responses = [
self.list_objects_response(['bucket']),
# Mocked GetObject response with a checksum algorithm specified
{
'ETag': 'foo-1',
'ChecksumSHA256': 'checksum',
'Body': BytesIO(b'foo')
}
]
cmdline = f'{self.prefix} s3://bucket/foo {self.files.rootdir} --checksum-mode ENABLED'
self.run_cmd(cmdline, expected_rc=0)
self.assertEqual(self.operations_called[0][0].name, 'ListObjectsV2')
self.assertEqual(self.operations_called[1][0].name, 'GetObject')
self.assertIn(('ChecksumMode', 'ENABLED'), self.operations_called[1][1].items())


class TestSyncCommandWithS3Express(BaseS3TransferCommandTest):

Expand Down
40 changes: 40 additions & 0 deletions tests/unit/customizations/s3/test_subcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,46 @@ def test_validate_no_streaming_paths(self):
cmd_params.add_paths(paths)
self.assertFalse(cmd_params.parameters['is_stream'])

def test_validate_checksum_algorithm_download_error(self):
paths = ['s3://bucket/key', self.file_creator.rootdir]
parameters = {'checksum_algorithm': 'CRC32'}
cmd_params = CommandParameters('cp', parameters, '')
with self.assertRaises(ValueError) as cm:
cmd_params.add_paths(paths)
self.assertIn('Expected checksum-algorithm parameter to be used with one of following path formats', cm.msg)

def test_validate_checksum_algorithm_sync_download_error(self):
paths = ['s3://bucket/key', self.file_creator.rootdir]
parameters = {'checksum_algorithm': 'CRC32C'}
cmd_params = CommandParameters('sync', parameters, '')
with self.assertRaises(ValueError) as cm:
cmd_params.add_paths(paths)
self.assertIn('Expected checksum-algorithm parameter to be used with one of following path formats', cm.msg)

def test_validate_checksum_mode_upload_error(self):
paths = [self.file_creator.rootdir, 's3://bucket/key']
parameters = {'checksum_mode': 'ENABLED'}
cmd_params = CommandParameters('cp', parameters, '')
with self.assertRaises(ValueError) as cm:
cmd_params.add_paths(paths)
self.assertIn('Expected checksum-mode parameter to be used with one of following path formats', cm.msg)

def test_validate_checksum_mode_sync_upload_error(self):
paths = [self.file_creator.rootdir, 's3://bucket/key']
parameters = {'checksum_mode': 'ENABLED'}
cmd_params = CommandParameters('sync', parameters, '')
with self.assertRaises(ValueError) as cm:
cmd_params.add_paths(paths)
self.assertIn('Expected checksum-mode parameter to be used with one of following path formats', cm.msg)

def test_validate_checksum_mode_move_error(self):
paths = ['s3://bucket/key', 's3://bucket2/key']
parameters = {'checksum_mode': 'ENABLED'}
cmd_params = CommandParameters('mv', parameters, '')
with self.assertRaises(ValueError) as cm:
cmd_params.add_paths(paths)
self.assertIn('Expected checksum-mode parameter to be used with one of following path formats', cm.msg)

def test_validate_streaming_paths_error(self):
parameters = {'src': '-', 'dest': 's3://bucket'}
cmd_params = CommandParameters('sync', parameters, '')
Expand Down
Loading

0 comments on commit cb60175

Please sign in to comment.