From 16148276ee5ddbe600f4e66fe4bc35c7d238b75b Mon Sep 17 00:00:00 2001 From: rafsaf Date: Sun, 13 Aug 2023 12:40:34 +0200 Subject: [PATCH] Improve folder and file performance using symlinks, added ZIP_SKIP_INTEGRITY_CHECK and some refactoring --- CNAME | 1 - backuper/backup_targets/file.py | 8 ++-- backuper/backup_targets/folder.py | 8 ++-- backuper/backup_targets/mariadb.py | 6 +-- backuper/backup_targets/mysql.py | 6 +-- backuper/backup_targets/postgresql.py | 6 +-- backuper/config.py | 1 + backuper/core.py | 46 ++++++++++++++----- backuper/main.py | 2 +- backuper/upload_providers/base_provider.py | 2 +- backuper/upload_providers/debug.py | 11 +---- .../upload_providers/google_cloud_storage.py | 6 +-- docs/configuration.md | 3 +- pyproject.toml | 2 +- tests/test_core.py | 6 ++- tests/test_main.py | 6 +-- tests/test_storage_provider_gcs.py | 4 +- tests/test_storage_provider_local.py | 4 +- 18 files changed, 72 insertions(+), 56 deletions(-) delete mode 100644 CNAME diff --git a/CNAME b/CNAME deleted file mode 100644 index 28ebfce..0000000 --- a/CNAME +++ /dev/null @@ -1 +0,0 @@ -backuper.rafsaf.pl \ No newline at end of file diff --git a/backuper/backup_targets/file.py b/backuper/backup_targets/file.py index 0ffb372..f31ab3c 100644 --- a/backuper/backup_targets/file.py +++ b/backuper/backup_targets/file.py @@ -25,8 +25,8 @@ def __init__( def _backup(self) -> Path: out_file = core.get_new_backup_path(self.env_name, self.file.name) - shell_args = f"cp -f {self.file} {out_file}" - log.debug("start cp -f in subprocess: %s", shell_args) - core.run_subprocess(shell_args) - log.debug("finished cp -f, output: %s", out_file) + shell_create_file_symlink = f"ln -s {self.file} {out_file}" + log.debug("start ln in subprocess: %s", shell_create_file_symlink) + core.run_subprocess(shell_create_file_symlink) + log.debug("finished ln, output: %s", out_file) return out_file diff --git a/backuper/backup_targets/folder.py b/backuper/backup_targets/folder.py index 2331e5f..82ac808 
100644 --- a/backuper/backup_targets/folder.py +++ b/backuper/backup_targets/folder.py @@ -25,8 +25,8 @@ def __init__( def _backup(self) -> Path: out_file = core.get_new_backup_path(self.env_name, self.folder.name) - shell_args = f"cp -r {self.folder} {out_file}" - log.debug("start cp -r in subprocess: %s", shell_args) - core.run_subprocess(shell_args) - log.debug("finished cp -r, output: %s", out_file) + shell_create_dir_symlink = f"ln -s {self.folder} {out_file}" + log.debug("start ln in subprocess: %s", shell_create_dir_symlink) + core.run_subprocess(shell_create_dir_symlink) + log.debug("finished ln, output: %s", out_file) return out_file diff --git a/backuper/backup_targets/mariadb.py b/backuper/backup_targets/mariadb.py index 145ca33..04883f9 100644 --- a/backuper/backup_targets/mariadb.py +++ b/backuper/backup_targets/mariadb.py @@ -107,11 +107,11 @@ def _backup(self) -> Path: name = f"{escaped_dbname}_{self.db_version}" out_file = core.get_new_backup_path(self.env_name, name, sql=True) db = shlex.quote(self.db) - shell_args = ( + shell_mariadb_dump_db = ( f"mariadb-dump --defaults-file={self.option_file} " f"--result-file={out_file} --verbose {db}" ) - log.debug("start mariadbdump in subprocess: %s", shell_args) - core.run_subprocess(shell_args) + log.debug("start mariadbdump in subprocess: %s", shell_mariadb_dump_db) + core.run_subprocess(shell_mariadb_dump_db) log.debug("finished mariadbdump, output: %s", out_file) return out_file diff --git a/backuper/backup_targets/mysql.py b/backuper/backup_targets/mysql.py index 51b5690..4aeaa22 100644 --- a/backuper/backup_targets/mysql.py +++ b/backuper/backup_targets/mysql.py @@ -108,11 +108,11 @@ def _backup(self) -> Path: out_file = core.get_new_backup_path(self.env_name, name, sql=True) db = shlex.quote(self.db) - shell_args = ( + shell_mysqldump_db = ( f"mysqldump --defaults-file={self.option_file} " f"--result-file={out_file} --verbose {db}" ) - log.debug("start mysqldump in subprocess: %s", shell_args) - 
core.run_subprocess(shell_args) + log.debug("start mysqldump in subprocess: %s", shell_mysqldump_db) + core.run_subprocess(shell_mysqldump_db) log.debug("finished mysqldump, output: %s", out_file) return out_file diff --git a/backuper/backup_targets/postgresql.py b/backuper/backup_targets/postgresql.py index bab00c2..f09dd90 100644 --- a/backuper/backup_targets/postgresql.py +++ b/backuper/backup_targets/postgresql.py @@ -124,8 +124,8 @@ def _backup(self) -> Path: escaped_dbname = core.safe_text_version(self.db) name = f"{escaped_dbname}_{self.db_version}" out_file = core.get_new_backup_path(self.env_name, name, sql=True) - shell_args = f"pg_dump -v -O -d {self.escaped_conn_uri} -f {out_file}" - log.debug("start pg_dump in subprocess: %s", shell_args) - core.run_subprocess(shell_args) + shell_pg_dump_db = f"pg_dump -v -O -d {self.escaped_conn_uri} -f {out_file}" + log.debug("start pg_dump in subprocess: %s", shell_pg_dump_db) + core.run_subprocess(shell_pg_dump_db) log.debug("finished pg_dump, output: %s", out_file) return out_file diff --git a/backuper/config.py b/backuper/config.py index addfb91..3471753 100644 --- a/backuper/config.py +++ b/backuper/config.py @@ -38,6 +38,7 @@ class BackupTargetEnum(StrEnum): LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") BACKUP_PROVIDER = os.environ.get("BACKUP_PROVIDER", "") ZIP_ARCHIVE_PASSWORD = os.environ.get("ZIP_ARCHIVE_PASSWORD", "") +ZIP_SKIP_INTEGRITY_CHECK = os.environ.get("ZIP_SKIP_INTEGRITY_CHECK", "false") SUBPROCESS_TIMEOUT_SECS: int = int(os.environ.get("SUBPROCESS_TIMEOUT_SECS", 60 * 60)) SIGTERM_TIMEOUT_SECS: float = float(os.environ.get("SIGTERM_TIMEOUT_SECS", 30)) ZIP_ARCHIVE_LEVEL: int = int(os.environ.get("ZIP_ARCHIVE_LEVEL", 3)) diff --git a/backuper/core.py b/backuper/core.py index e54262d..eaf85c4 100644 --- a/backuper/core.py +++ b/backuper/core.py @@ -4,6 +4,7 @@ import re import secrets import shlex +import shutil import subprocess from datetime import datetime from pathlib import Path @@ -46,6 
+47,14 @@ def run_subprocess(shell_args: str) -> str: return p.stdout +def remove_path(path: Path) -> None: + if path.exists(): + if path.is_file() or path.is_symlink(): + path.unlink() + else: + shutil.rmtree(path=path) + + def get_new_backup_path(env_name: str, name: str, sql: bool = False) -> Path: base_dir_path = config.CONST_BACKUP_FOLDER_PATH / env_name base_dir_path.mkdir(mode=0o700, exist_ok=True, parents=True) @@ -64,21 +73,34 @@ def get_new_backup_path(env_name: str, name: str, sql: bool = False) -> Path: def run_create_zip_archive(backup_file: Path) -> Path: seven_zip_path = seven_zip_bin_path() out_file = Path(f"{backup_file}.zip") - log.debug("run_create_zip_archive start creating in subprocess: %s", backup_file) + log.info("start creating zip archive in subprocess: %s", backup_file) zip_escaped_password = shlex.quote(config.ZIP_ARCHIVE_PASSWORD) - shell_args_create = ( + shell_create_7zip_archive = ( f"{seven_zip_path} a -p{zip_escaped_password} " f"-mx={config.ZIP_ARCHIVE_LEVEL} {out_file} {backup_file}" ) - run_subprocess(shell_args_create) - log.debug("run_create_zip_archive finished, output: %s", out_file) - - log.debug("run_create_zip_archive start integriy test in subprocess: %s", out_file) - shell_args_integriy = f"{seven_zip_path} t -p{zip_escaped_password} {out_file}" - integrity_check_result = run_subprocess(shell_args_integriy) + run_subprocess(shell_create_7zip_archive) + log.info("finished zip archive creating") + + if config.ZIP_SKIP_INTEGRITY_CHECK != "false": + return out_file + + log.info( + ( + "start zip archive integriy test on %s in subprocess, " + "you can control it using ZIP_SKIP_INTEGRITY_CHECK" + ), + out_file, + ) + shell_7zip_archive_integriy_check = ( + f"{seven_zip_path} t -p{zip_escaped_password} {out_file}" + ) + integrity_check_result = run_subprocess(shell_7zip_archive_integriy_check) if "Everything is Ok" not in integrity_check_result: # pragma: no cover - raise AssertionError("zip arichive integrity fatal error") - 
log.debug("run_create_zip_archive finish integriy test in subprocess: %s", out_file) + raise AssertionError( + "zip arichive integrity test on %s: %s", out_file, integrity_check_result + ) + log.info("finished zip archive integriy test") return out_file @@ -160,8 +182,8 @@ def create_provider_model() -> ProviderModel: def seven_zip_bin_path() -> Path: - shell_args_cpu_architecture = "dpkg --print-architecture" - cpu_arch = run_subprocess(shell_args_cpu_architecture).strip() + shell_get_cpu_architecture = "dpkg --print-architecture" + cpu_arch = run_subprocess(shell_get_cpu_architecture).strip() seven_zip = config.CONST_BIN_ZIP_PATH / f"{cpu_arch}/7zz" if not seven_zip.exists(): # pragma: no cover raise RuntimeError( diff --git a/backuper/main.py b/backuper/main.py index 39a1c8c..40577f9 100644 --- a/backuper/main.py +++ b/backuper/main.py @@ -137,7 +137,7 @@ def run_backup(target: BaseBackupTarget, provider: BaseUploadProvider) -> None: env_name=target.env_name, send_on_success=True, ): - provider.safe_clean(backup_file=backup_file, max_backups=target.max_backups) + provider.clean(backup_file=backup_file, max_backups=target.max_backups) log.info( "backup and upload finished, next backup of target `%s` is: %s", diff --git a/backuper/upload_providers/base_provider.py b/backuper/upload_providers/base_provider.py index 8b64ed4..01ea96d 100644 --- a/backuper/upload_providers/base_provider.py +++ b/backuper/upload_providers/base_provider.py @@ -24,7 +24,7 @@ def post_save(self, backup_file: Path) -> str: raise @final - def safe_clean(self, backup_file: Path, max_backups: int) -> None: + def clean(self, backup_file: Path, max_backups: int) -> None: try: return self._clean(backup_file=backup_file, max_backups=max_backups) except Exception as err: diff --git a/backuper/upload_providers/debug.py b/backuper/upload_providers/debug.py index 4e49001..7f287cf 100644 --- a/backuper/upload_providers/debug.py +++ b/backuper/upload_providers/debug.py @@ -1,5 +1,4 @@ import logging 
-import shutil from pathlib import Path from backuper import config, core @@ -25,10 +24,7 @@ def _post_save(self, backup_file: Path) -> str: return str(zip_file) def _clean(self, backup_file: Path, max_backups: int) -> None: - if backup_file.is_file(): - backup_file.unlink() - else: - shutil.rmtree(backup_file) + core.remove_path(backup_file) files: list[str] = [] for backup_path in backup_file.parent.iterdir(): files.append(str(backup_path.absolute())) @@ -36,10 +32,7 @@ def _clean(self, backup_file: Path, max_backups: int) -> None: while len(files) > max_backups: backup_to_remove = Path(files.pop()) try: - if backup_to_remove.is_file(): - backup_to_remove.unlink() - else: - shutil.rmtree(backup_to_remove) + core.remove_path(backup_to_remove) log.info("removed path %s", backup_to_remove) except Exception as e: # pragma: no cover log.error( diff --git a/backuper/upload_providers/google_cloud_storage.py b/backuper/upload_providers/google_cloud_storage.py index 4628032..110a25e 100644 --- a/backuper/upload_providers/google_cloud_storage.py +++ b/backuper/upload_providers/google_cloud_storage.py @@ -1,7 +1,6 @@ import base64 import logging import os -import shutil from pathlib import Path import google.cloud.storage as storage @@ -63,10 +62,7 @@ def _post_save(self, backup_file: Path) -> str: def _clean(self, backup_file: Path, max_backups: int) -> None: for backup_path in backup_file.parent.iterdir(): - if backup_path.is_dir(): - shutil.rmtree(backup_path) - else: - backup_path.unlink() + core.remove_path(backup_path) log.info("removed %s from local disk", backup_path) backup_list_cloud: list[str] = [] diff --git a/docs/configuration.md b/docs/configuration.md index 7a8e79b..a18c274 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -12,7 +12,7 @@ Environemt variables | ZIP_ARCHIVE_PASSWORD | string[**required**] | Zip archive password that all backups generated by this backuper instance will have. When it is lost, you lose access to your backups. 
| - | | BACKUP_PROVIDER | string[**required**] | See `Providers` chapter, choosen backup provider for example [GCS](./providers/google_cloud_storage.md). | - | | BACKUP_MAX_NUMBER | int | How many backups can live at once for backup target. Defaults to `7`. Note this must makes sense with cron expression you use. For example if you want to have 7 day retention, and make backups at 5:00, BACKUP_MAX_NUMBER=7 is fine, but if you make 4 backups per day, you will need BACKUP_MAX_NUMBER=28. | 7 | -| ROOT_MODE | bool | If false, process in container will start backuper using user with minimal permissions required. If `true` or anything different than `false`, it will run as root (it may help for example with file/directory backup permission issues in mounted volumes). | false | +| ROOT_MODE | bool | If false, process in container will start backuper using user with minimal permissions required. If `true`, it will run as root (it may help for example with file/directory backup permission issues in mounted volumes). | false | | POSTGRESQL\_... | backup target syntax | PostgreSQL database target, see [PostgreSQL](./backup_targets/postgresql.md). | - | | MYSQL\_... | backup target syntax | MySQL database target, see [MySQL](./backup_targets/mysql.md). | - | | MARIADB\_... | backup target syntax | MariaDB database target, see [MariaDB](./backup_targets/mariadb.md). | - | @@ -26,6 +26,7 @@ Environemt variables | ZIP_ARCHIVE_LEVEL | int[1-9] | Compression level of 7-zip via `-mx` option: `-mx[N] : set compression level: -mx1 (fastest) ... -mx9 (ultra)`. Defaults to `3` which should be sufficient and fast enough. | 3 | | LOG_FOLDER_PATH | string | Path to store log files, for local development `./logs`, in container `/var/log/backuper`. | /var/log/backuper | | SIGTERM_TIMEOUT_SECS | int | Time in seconds on exit how long backuper will wait for ongoing backup threads before force killing them and exiting. 
| 30 | +| ZIP_SKIP_INTEGRITY_CHECK | bool | By default, after the 7zip archive is created, an integrity check runs on it. You can opt out of this behaviour for performance reasons by setting it to `true`. | false |

diff --git a/pyproject.toml b/pyproject.toml index 3c80e66..7c69fee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ authors = ["RafaƂ Safin "] description = "A tool for performing scheduled database backups and transferring encrypted data to secure clouds, for home labs, hobby projects, etc., in environments such as k8s, docker, vms." license = "GNU GPLv3" name = "backuper" -version = "3.1" +version = "3.2" [tool.poetry.dependencies] croniter = "^1.3.7" diff --git a/tests/test_core.py b/tests/test_core.py index b16fe0f..2a52ab1 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -61,7 +61,11 @@ def test_get_new_backup_path_sql(caplog: LogCaptureFixture) -> None: assert caplog.messages == [] -def test_run_create_zip_archive(tmp_path: Path) -> None: +@pytest.mark.parametrize("integrity", ["false", "true"]) +def test_run_create_zip_archive( + tmp_path: Path, integrity: str, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setattr(config, "ZIP_SKIP_INTEGRITY_CHECK", integrity) fake_backup_file = tmp_path / "fake_backup" with open(fake_backup_file, "w") as f: f.write("abcdefghijk\n12345") diff --git a/tests/test_main.py b/tests/test_main.py index d3bee2f..49733aa 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -121,7 +121,7 @@ def test_main(monkeypatch: pytest.MonkeyPatch) -> None: @pytest.mark.parametrize( - "make_backup_side_effect,post_save_side_effect,safe_clean_side_effect", + "make_backup_side_effect,post_save_side_effect,clean_side_effect", [ (ValueError(), None, None), (None, ValueError(), None), @@ -131,7 +131,7 @@ def test_main(monkeypatch: pytest.MonkeyPatch) -> None: def test_run_backup_notifications_fail_message_is_fired_when_it_fails( make_backup_side_effect: Any | None, post_save_side_effect: Any | None, - safe_clean_side_effect: Any | None, + clean_side_effect: Any | None, monkeypatch: pytest.MonkeyPatch, ) -> None: fail_message_mock = Mock() @@ -151,7 +151,7 @@ def 
test_run_backup_notifications_fail_message_is_fired_when_it_fails( monkeypatch.setattr(target, "_backup", backup_mock) provider = UploadProviderLocalDebug() monkeypatch.setattr(provider, "_post_save", Mock(side_effect=post_save_side_effect)) - monkeypatch.setattr(provider, "_clean", Mock(side_effect=safe_clean_side_effect)) + monkeypatch.setattr(provider, "_clean", Mock(side_effect=clean_side_effect)) with pytest.raises(ValueError): main.run_backup(target=target, provider=provider) diff --git a/tests/test_storage_provider_gcs.py b/tests/test_storage_provider_gcs.py index 4b35831..10b3e82 100644 --- a/tests/test_storage_provider_gcs.py +++ b/tests/test_storage_provider_gcs.py @@ -92,7 +92,7 @@ def __init__(self, blob_name: str) -> None: ] -@pytest.mark.parametrize("gcs_method_name", ["_clean", "safe_clean"]) +@pytest.mark.parametrize("gcs_method_name", ["_clean", "clean"]) def test_gcs_clean_file_and_short_blob_list( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, gcs_method_name: str ) -> None: @@ -128,7 +128,7 @@ def test_gcs_clean_file_and_short_blob_list( single_blob_mock.delete.assert_called_once_with() -@pytest.mark.parametrize("gcs_method_name", ["_clean", "safe_clean"]) +@pytest.mark.parametrize("gcs_method_name", ["_clean", "clean"]) def test_gcs_clean_directory_and_long_blob_list( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, gcs_method_name: str ) -> None: diff --git a/tests/test_storage_provider_local.py b/tests/test_storage_provider_local.py index 7d6d805..0134958 100644 --- a/tests/test_storage_provider_local.py +++ b/tests/test_storage_provider_local.py @@ -5,7 +5,7 @@ from backuper.upload_providers import UploadProviderLocalDebug -@pytest.mark.parametrize("method_name", ["_clean", "safe_clean"]) +@pytest.mark.parametrize("method_name", ["_clean", "clean"]) def test_gcs_clean_file( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, method_name: str ) -> None: @@ -30,7 +30,7 @@ def test_gcs_clean_file( assert fake_backup_file_zip_path4.exists() 
-@pytest.mark.parametrize("method_name", ["_clean", "safe_clean"]) +@pytest.mark.parametrize("method_name", ["_clean", "clean"]) def test_gcs_clean_folder( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, method_name: str ) -> None: