diff --git a/.changes/next-release/bugfix-ReadFileChunk-16290.json b/.changes/next-release/bugfix-ReadFileChunk-16290.json new file mode 100644 index 00000000..48ef348a --- /dev/null +++ b/.changes/next-release/bugfix-ReadFileChunk-16290.json @@ -0,0 +1,5 @@ +{ + "type": "bugfix", + "category": "ReadFileChunk", + "description": "Fix seek behavior in ReadFileChunk class" +} diff --git a/s3transfer/bandwidth.py b/s3transfer/bandwidth.py index 9c0b0bcc..07f50963 100644 --- a/s3transfer/bandwidth.py +++ b/s3transfer/bandwidth.py @@ -180,8 +180,8 @@ def signal_not_transferring(self): """Signal that data being read is not being transferred to S3""" self.disable_bandwidth_limiting() - def seek(self, where): - self._fileobj.seek(where) + def seek(self, where, whence=0): + self._fileobj.seek(where, whence) def tell(self): return self._fileobj.tell() diff --git a/s3transfer/upload.py b/s3transfer/upload.py index a6b22a6c..e333a7e9 100644 --- a/s3transfer/upload.py +++ b/s3transfer/upload.py @@ -85,8 +85,8 @@ def read(self, amount=None): raise self._transfer_coordinator.exception return self._fileobj.read(amount) - def seek(self, where): - self._fileobj.seek(where) + def seek(self, where, whence=0): + self._fileobj.seek(where, whence) def tell(self): return self._fileobj.tell() diff --git a/s3transfer/utils.py b/s3transfer/utils.py index 82683d25..67f7e028 100644 --- a/s3transfer/utils.py +++ b/s3transfer/utils.py @@ -363,9 +363,9 @@ def write(self, data): self._open_if_needed() self._fileobj.write(data) - def seek(self, where): + def seek(self, where, whence=0): self._open_if_needed() - self._fileobj.seek(where) + self._fileobj.seek(where, whence) def tell(self): if self._fileobj is None: @@ -425,6 +425,8 @@ def __init__(self, fileobj, chunk_size, full_file_size, self._size = self._calculate_file_size( self._fileobj, requested_size=chunk_size, start_byte=self._start_byte, actual_file_size=full_file_size) + # _amount_read represents the position in the chunk and may exceed + # the chunk size, but won't allow reads out of bounds. self._amount_read = 0 self._callbacks = callbacks if callbacks is None: @@ -473,10 +475,11 @@ def _calculate_file_size(self, fileobj, requested_size, start_byte, return min(max_chunk_size, requested_size) def read(self, amount=None): + amount_left = max(self._size - self._amount_read, 0) if amount is None: - amount_to_read = self._size - self._amount_read + amount_to_read = amount_left else: - amount_to_read = min(self._size - self._amount_read, amount) + amount_to_read = min(amount_left, amount) data = self._fileobj.read(amount_to_read) self._amount_read += len(data) if self._callbacks is not None and self._callbacks_enabled: @@ -499,13 +502,29 @@ def enable_callback(self): def disable_callback(self): self._callbacks_enabled = False - def seek(self, where): - self._fileobj.seek(self._start_byte + where) + def seek(self, where, whence=0): + if whence not in (0, 1, 2): + # Mimic io's error for invalid whence values + raise ValueError( + "invalid whence (%s, should be 0, 1 or 2)" % whence) + + # Recalculate where based on chunk attributes so seek from file + # start (whence=0) is always used + where += self._start_byte + if whence == 1: + where += self._amount_read + elif whence == 2: + where += self._size + + self._fileobj.seek(max(where, self._start_byte)) if self._callbacks is not None and self._callbacks_enabled: # To also rewind the callback() for an accurate progress report + bounded_where = max(min(where - self._start_byte, self._size), 0) + bounded_amount_read = min(self._amount_read, self._size) + amount = bounded_where - bounded_amount_read invoke_progress_callbacks( - self._callbacks, bytes_transferred=where - self._amount_read) - self._amount_read = where + self._callbacks, bytes_transferred=amount) + self._amount_read = max(where - self._start_byte, 0) def close(self): if self._close_callbacks is not None and self._callbacks_enabled: diff --git a/tests/functional/test_manager.py b/tests/functional/test_manager.py index ef579eba..61a69903 100644 --- a/tests/functional/test_manager.py +++ b/tests/functional/test_manager.py @@ -10,7 +10,7 @@ # distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -from io import RawIOBase +from io import BytesIO from botocore.awsrequest import create_request_object import mock @@ -27,7 +27,7 @@ class ArbitraryException(Exception): pass -class SignalTransferringBody(RawIOBase): +class SignalTransferringBody(BytesIO): """A mocked body with the ability to signal when transfers occur""" def __init__(self): super(SignalTransferringBody, self).__init__() diff --git a/tests/unit/test_bandwidth.py b/tests/unit/test_bandwidth.py index 4f35a7a8..099ca856 100644 --- a/tests/unit/test_bandwidth.py +++ b/tests/unit/test_bandwidth.py @@ -186,7 +186,7 @@ def test_seek(self): stream.seek(1) self.assertEqual( mock_fileobj.seek.call_args_list, - [mock.call(1)] + [mock.call(1, 0)] ) def test_tell(self): diff --git a/tests/unit/test_download.py b/tests/unit/test_download.py index 19f274c0..2e39c8df 100644 --- a/tests/unit/test_download.py +++ b/tests/unit/test_download.py @@ -58,7 +58,7 @@ def __init__(self): self._pos = 0 self.writes = [] - def seek(self, pos): + def seek(self, pos, whence=0): self._pos = pos def write(self, data): diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 733fed04..28fc1bd0 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -493,6 +493,120 @@ def test_tell_and_seek(self): self.assertEqual(chunk.tell(), 3) chunk.seek(0) self.assertEqual(chunk.tell(), 0) + chunk.seek(1, whence=1) + self.assertEqual(chunk.tell(), 1) + chunk.seek(-1, whence=1) + self.assertEqual(chunk.tell(), 0) + chunk.seek(-1, whence=2) + self.assertEqual(chunk.tell(), 2) + + def test_tell_and_seek_boundaries(self): + # Test to ensure ReadFileChunk behaves the same as the + # Python standard library around seeking and reading out + # of bounds in a file object. + data = b'abcdefghij12345678klmnopqrst' + start_pos = 10 + chunk_size = 8 + + # Create test file + filename = os.path.join(self.tempdir, 'foo') + with open(filename, 'wb') as f: + f.write(data) + + # ReadFileChunk should be a substring of only numbers + file_objects = [ + ReadFileChunk.from_filename( + filename, start_byte=start_pos, chunk_size=chunk_size + )] + + # Uncomment next line to validate we match Python's io.BytesIO + # file_objects.append(io.BytesIO(data[start_pos:start_pos+chunk_size])) + + for obj in file_objects: + self._assert_whence_start_behavior(obj) + self._assert_whence_end_behavior(obj) + self._assert_whence_relative_behavior(obj) + self._assert_boundary_behavior(obj) + + def _assert_whence_start_behavior(self, file_obj): + self.assertEqual(file_obj.tell(), 0) + + file_obj.seek(1, 0) + self.assertEqual(file_obj.tell(), 1) + + file_obj.seek(1) + self.assertEqual(file_obj.tell(), 1) + self.assertEqual(file_obj.read(), b'2345678') + + file_obj.seek(3, 0) + self.assertEqual(file_obj.tell(), 3) + + file_obj.seek(0, 0) + self.assertEqual(file_obj.tell(), 0) + + def _assert_whence_relative_behavior(self, file_obj): + self.assertEqual(file_obj.tell(), 0) + + file_obj.seek(2, 1) + self.assertEqual(file_obj.tell(), 2) + + file_obj.seek(1, 1) + self.assertEqual(file_obj.tell(), 3) + self.assertEqual(file_obj.read(), b'45678') + + file_obj.seek(20, 1) + self.assertEqual(file_obj.tell(), 28) + + file_obj.seek(-30, 1) + self.assertEqual(file_obj.tell(), 0) + self.assertEqual(file_obj.read(), b'12345678') + + file_obj.seek(-8, 1) + self.assertEqual(file_obj.tell(), 0) + + def _assert_whence_end_behavior(self, file_obj): + self.assertEqual(file_obj.tell(), 0) + + file_obj.seek(-1, 2) + self.assertEqual(file_obj.tell(), 7) + + file_obj.seek(1, 2) + self.assertEqual(file_obj.tell(), 9) + + file_obj.seek(3, 2) + self.assertEqual(file_obj.tell(), 11) + self.assertEqual(file_obj.read(), b'') + + file_obj.seek(-15, 2) + self.assertEqual(file_obj.tell(), 0) + self.assertEqual(file_obj.read(), b'12345678') + + file_obj.seek(-8, 2) + self.assertEqual(file_obj.tell(), 0) + + def _assert_boundary_behavior(self, file_obj): + # Verify we're at the start + self.assertEqual(file_obj.tell(), 0) + + # Verify we can't move backwards beyond start of file + file_obj.seek(-10, 1) + self.assertEqual(file_obj.tell(), 0) + + # Verify we *can* move after end of file, but return nothing + file_obj.seek(10, 2) + self.assertEqual(file_obj.tell(), 18) + self.assertEqual(file_obj.read(), b'') + self.assertEqual(file_obj.read(10), b'') + + # Verify we can partially rewind + file_obj.seek(-12, 1) + self.assertEqual(file_obj.tell(), 6) + self.assertEqual(file_obj.read(), b'78') + self.assertEqual(file_obj.tell(), 8) + + # Verify we can rewind to start + file_obj.seek(0) + self.assertEqual(file_obj.tell(), 0) def test_file_chunk_supports_context_manager(self): filename = os.path.join(self.tempdir, 'foo') @@ -554,6 +668,91 @@ def test_callback_will_also_be_triggered_by_seek(self): chunk.read(2) self.assertEqual(self.amounts_seen, [2, -2, 2, -1, 2]) + def test_callback_triggered_by_out_of_bound_seeks(self): + data = b'abcdefghij1234567890klmnopqr' + + # Create test file + filename = os.path.join(self.tempdir, 'foo') + with open(filename, 'wb') as f: + f.write(data) + chunk = ReadFileChunk.from_filename( + filename, start_byte=10, chunk_size=10, + callbacks=[self.callback]) + + # Seek calls that generate "0" progress are skipped by + # invoke_progress_callbacks and won't appear in the list. + expected_callback_prog = [10, -5, 5, -1, 1, -1, 1, -5, 5, -10] + + self._assert_out_of_bound_start_seek(chunk, expected_callback_prog) + self._assert_out_of_bound_relative_seek(chunk, expected_callback_prog) + self._assert_out_of_bound_end_seek(chunk, expected_callback_prog) + + def _assert_out_of_bound_start_seek(self, chunk, expected): + # clear amounts_seen + self.amounts_seen = [] + self.assertEqual(self.amounts_seen, []) + + # (position, change) + chunk.seek(20) # (20, 10) + chunk.seek(5) # (5, -5) + chunk.seek(20) # (20, 5) + chunk.seek(9) # (9, -1) + chunk.seek(20) # (20, 1) + chunk.seek(11) # (11, 0) + chunk.seek(20) # (20, 0) + chunk.seek(9) # (9, -1) + chunk.seek(20) # (20, 1) + chunk.seek(5) # (5, -5) + chunk.seek(20) # (20, 5) + chunk.seek(0) # (0, -10) + chunk.seek(0) # (0, 0) + + self.assertEqual(self.amounts_seen, expected) + + def _assert_out_of_bound_relative_seek(self, chunk, expected): + # clear amounts_seen + self.amounts_seen = [] + self.assertEqual(self.amounts_seen, []) + + # (position, change) + chunk.seek(20, 1) # (20, 10) + chunk.seek(-15, 1) # (5, -5) + chunk.seek(15, 1) # (20, 5) + chunk.seek(-11, 1) # (9, -1) + chunk.seek(11, 1) # (20, 1) + chunk.seek(-9, 1) # (11, 0) + chunk.seek(9, 1) # (20, 0) + chunk.seek(-11, 1) # (9, -1) + chunk.seek(11, 1) # (20, 1) + chunk.seek(-15, 1) # (5, -5) + chunk.seek(15, 1) # (20, 5) + chunk.seek(-20, 1) # (0, -10) + chunk.seek(-1000, 1) # (0, 0) + + self.assertEqual(self.amounts_seen, expected) + + def _assert_out_of_bound_end_seek(self, chunk, expected): + # clear amounts_seen + self.amounts_seen = [] + self.assertEqual(self.amounts_seen, []) + + # (position, change) + chunk.seek(10, 2) # (20, 10) + chunk.seek(-5, 2) # (5, -5) + chunk.seek(10, 2) # (20, 5) + chunk.seek(-1, 2) # (9, -1) + chunk.seek(10, 2) # (20, 1) + chunk.seek(1, 2) # (11, 0) + chunk.seek(10, 2) # (20, 0) + chunk.seek(-1, 2) # (9, -1) + chunk.seek(10, 2) # (20, 1) + chunk.seek(-5, 2) # (5, -5) + chunk.seek(10, 2) # (20, 5) + chunk.seek(-10, 2) # (0, -10) + chunk.seek(-1000, 2) # (0, 0) + + self.assertEqual(self.amounts_seen, expected) + def test_close_callbacks(self): with open(self.filename) as f: chunk = ReadFileChunk(f, chunk_size=1, full_file_size=3,