From 756448f2d996895658dfe11188710197aacd8b94 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Wed, 16 Oct 2019 13:58:11 -0500 Subject: [PATCH 1/3] Implement sync_directory for BuildMediaStorageMixin --- readthedocs/builds/storage.py | 41 ++++++++++++++- .../rtd_tests/tests/test_build_storage.py | 52 +++++++++++++++++++ 2 files changed, 91 insertions(+), 2 deletions(-) diff --git a/readthedocs/builds/storage.py b/readthedocs/builds/storage.py index 0b96636a9c1..dc54b756919 100644 --- a/readthedocs/builds/storage.py +++ b/readthedocs/builds/storage.py @@ -4,8 +4,7 @@ from django.conf import settings from django.core.exceptions import SuspiciousFileOperation from django.core.files.storage import FileSystemStorage -from storages.utils import safe_join, get_available_overwrite_name - +from storages.utils import get_available_overwrite_name, safe_join log = logging.getLogger(__name__) @@ -88,6 +87,44 @@ def copy_directory(self, source, destination): with filepath.open('rb') as fd: self.save(sub_destination, fd) + def sync_directory(self, source, destination): + """ + Sync a directory recursively to storage. + + Overwrites files in remote storage where a file in ``source`` exists (no timstamp checking done). + Removes files and folders in remote storage that are not present in ``source``. + + :param source: the source path on the local disk + :param destination: the destination path in storage + """ + + log.debug( + 'Syncing to media storage. source=%s destination=%s', + source, destination, + ) + source = Path(source) + copied_files = set() + copied_dirs = set() + for filepath in source.iterdir(): + sub_destination = self.join(destination, filepath.name) + if filepath.is_dir(): + # Recursively sync the subdirectory + self.sync_directory(filepath, sub_destination) + copied_dirs.add(filepath.name) + elif filepath.is_file(): + with filepath.open('rb') as fd: + self.save(sub_destination, fd) + copied_files.add(filepath.name) + + # Remove files that are not present in ``source`` + dest_folders, dest_files = self.listdir(self._dirpath(destination)) + for folder in dest_folders: + if folder not in copied_dirs: + self.delete_directory(self.join(destination, folder)) + for filename in dest_files: + if filename not in copied_files: + self.delete(self.join(destination, filename)) + def join(self, directory, filepath): return safe_join(directory, filepath) diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py index 0b8443e3a09..326c01d0d71 100644 --- a/readthedocs/rtd_tests/tests/test_build_storage.py +++ b/readthedocs/rtd_tests/tests/test_build_storage.py @@ -18,6 +18,25 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.test_media_dir, ignore_errors=True) + def assertFileTree(self, source, tree): + """ + Recursively check that ``source`` from storage has the same file tree as ``tree``. + + :param source: source path in storage + :param tree: a list of strings representing files + or tuples (string, list) representing directories. + """ + dirs_tree = [e for e in tree if not isinstance(e, str)] + + dirs, files = self.storage.listdir(source) + expected_dirs = [e[0] for e in dirs_tree] + expected_files = [e for e in tree if isinstance(e, str)] + self.assertCountEqual(dirs, expected_dirs) + self.assertCountEqual(files, expected_files) + + for folder, files in dirs_tree: + self.assertFileTree(self.storage.join(source, folder), files) + def test_copy_directory(self): self.assertFalse(self.storage.exists('files/test.html')) @@ -27,6 +46,39 @@ def test_copy_directory(self): self.assertTrue(self.storage.exists('files/api.fjson')) self.assertTrue(self.storage.exists('files/api/index.html')) + def test_sync_directory(self): + tmp_files_dir = os.path.join(tempfile.mkdtemp(), 'files') + shutil.copytree(files_dir, tmp_files_dir) + storage_dir = 'files' + + tree = [ + ('api', ['index.html']), + 'api.fjson', + 'conf.py', + 'test.html', + ] + self.storage.sync_directory(tmp_files_dir, storage_dir) + self.assertFileTree(storage_dir, tree) + + tree = [ + ('api', ['index.html']), + 'conf.py', + 'test.html', + ] + os.remove(os.path.join(tmp_files_dir, 'api.fjson')) + self.storage.sync_directory(tmp_files_dir, storage_dir) + self.assertFileTree(storage_dir, tree) + + tree = [ + # Cloud storage generally doesn't consider empty directories to exist + ('api', []), + 'conf.py', + 'test.html', + ] + shutil.rmtree(os.path.join(tmp_files_dir, 'api')) + self.storage.sync_directory(tmp_files_dir, storage_dir) + self.assertFileTree(storage_dir, tree) + def test_delete_directory(self): self.storage.copy_directory(files_dir, 'files') dirs, files = self.storage.listdir('files') From 175c54f054a7cda2fcde0d2a41a36c85c95149cf Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Wed, 16 Oct 2019 14:02:44 -0500 Subject: [PATCH 2/3] Use sync instead of copy --- readthedocs/projects/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index b5fa0402e99..60a34f248fc 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -847,7 +847,7 @@ def store_build_artifacts( }, ) try: - storage.copy_directory(from_path, to_path) + storage.sync_directory(from_path, to_path) except Exception: # Ideally this should just be an IOError # but some storage backends unfortunately throw other errors From 9a1e4de60f36e488a26651f25ee76d459721ec90 Mon Sep 17 00:00:00 2001 From: David Fischer Date: Wed, 16 Oct 2019 14:11:48 -0700 Subject: [PATCH 3/3] Small cleanup bits - Linting fix - Check against suspicious operations - Additional logging --- readthedocs/builds/storage.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/readthedocs/builds/storage.py b/readthedocs/builds/storage.py index dc54b756919..240fdcb6641 100644 --- a/readthedocs/builds/storage.py +++ b/readthedocs/builds/storage.py @@ -91,12 +91,14 @@ def sync_directory(self, source, destination): """ Sync a directory recursively to storage. - Overwrites files in remote storage where a file in ``source`` exists (no timstamp checking done). + Overwrites files in remote storage with files from ``source`` (no timstamp/hash checking). Removes files and folders in remote storage that are not present in ``source``. :param source: the source path on the local disk :param destination: the destination path in storage """ + if destination in ('', '/'): + raise SuspiciousFileOperation('Syncing all storage cannot be right') log.debug( 'Syncing to media storage. source=%s destination=%s', @@ -123,7 +125,9 @@ def sync_directory(self, source, destination): self.delete_directory(self.join(destination, folder)) for filename in dest_files: if filename not in copied_files: - self.delete(self.join(destination, filename)) + filepath = self.join(destination, filename) + log.debug('Deleting file from media storage. file=%s', filepath) + self.delete(filepath) def join(self, directory, filepath): return safe_join(directory, filepath)