From 85805ff3d2910c0b45e5c9151a3c2215860c9c1e Mon Sep 17 00:00:00 2001 From: PerseoGI Date: Fri, 7 Feb 2025 15:42:06 +0100 Subject: [PATCH] Fix 200% untar progress issue by avoiding a second read of file (#17708) * Fix 200% untar progress issue by avoiding a second read of file * Fix test cases and updated exception message * Updated FileProgress interval to 10s --- conan/tools/files/files.py | 46 ++++++++++--------- conans/client/rest/file_uploader.py | 2 +- test/unittests/tools/files/test_zipping.py | 2 +- .../util/files/strip_root_extract_test.py | 17 ++++++- 4 files changed, 43 insertions(+), 24 deletions(-) diff --git a/conan/tools/files/files.py b/conan/tools/files/files.py index 433c30e2de9..66a2a43a999 100644 --- a/conan/tools/files/files.py +++ b/conan/tools/files/files.py @@ -360,33 +360,37 @@ def untargz(filename, destination=".", pattern=None, strip_root=False, extract_f f = getattr(tarfile, f"{extract_filter}_filter", None) if extract_filter else None tarredgzippedFile.extraction_filter = f or (lambda member_, _: member_) if not pattern and not strip_root: + # Simple case: extract everything tarredgzippedFile.extractall(destination) else: - members = tarredgzippedFile.getmembers() - - if pattern: - members = list(filter(lambda m: fnmatch(m.name, pattern), - tarredgzippedFile.getmembers())) - if strip_root: - names = [member.name.replace("\\", "/") for member in members] - common_folder = os.path.commonprefix(names).split("/", 1)[0] - if not common_folder and len(names) > 1: - raise ConanException("The tgz file contains more than 1 folder in the root") - if len(names) == 1 and len(names[0].split("/", 1)) == 1: - raise ConanException("The tgz file contains a file in the root") - # Remove the directory entry if present - members = [m for m in members if m.name != common_folder] - for member in members: + # Read members one by one and process them + common_folder = None + for member in tarredgzippedFile: + if pattern and not fnmatch(member.name, pattern): + continue # Skip files that don’t match the pattern + + if strip_root: name = member.name.replace("\\", "/") - member.name = name.split("/", 1)[1] + if not common_folder: + splits = name.split("/", 1) + # First case for a plain folder in the root + if member.isdir() or len(splits) > 1: + common_folder = splits[0] # Find the root folder + else: + raise ConanException("Can't untar a tgz containing files in the root with strip_root enabled") + if not name.startswith(common_folder): + raise ConanException("The tgz file contains more than 1 folder in the root") + + # Adjust the member's name for extraction + member.name = name[len(common_folder) + 1:] member.path = member.name - if member.linkpath.startswith(common_folder): + if member.linkpath and member.linkpath.startswith(common_folder): # https://github.com/conan-io/conan/issues/11065 - linkpath = member.linkpath.replace("\\", "/") - member.linkpath = linkpath.split("/", 1)[1] + member.linkpath = member.linkpath[len(common_folder) + 1:].replace("\\", "/") member.linkname = member.linkpath - tarredgzippedFile.extractall(destination, members=members) - + # Extract file one by one, avoiding `extractall()` with members parameter: + # This will avoid a first whole file read resulting in a performant improvement around 25% + tarredgzippedFile.extract(member, path=destination) def check_sha1(conanfile, file_path, signature): """ diff --git a/conans/client/rest/file_uploader.py b/conans/client/rest/file_uploader.py index 0b831d2da51..5005a5c14c3 100644 --- a/conans/client/rest/file_uploader.py +++ b/conans/client/rest/file_uploader.py @@ -97,7 +97,7 @@ def _upload_file(self, url, abs_path, headers, auth, ref): class FileProgress(io.FileIO): - def __init__(self, path: str, msg: str = "Uploading", interval: float = 1, *args, **kwargs): + def __init__(self, path: str, msg: str = "Uploading", interval: float = 10, *args, **kwargs): super().__init__(path, *args, **kwargs) self._size = os.path.getsize(path) self._filename = os.path.basename(path) diff --git a/test/unittests/tools/files/test_zipping.py b/test/unittests/tools/files/test_zipping.py index 0c41a494d33..655271ad5af 100644 --- a/test/unittests/tools/files/test_zipping.py +++ b/test/unittests/tools/files/test_zipping.py @@ -137,7 +137,7 @@ def test_untargz_with_strip_root_fails(): dest_dir = temp_folder() with pytest.raises(ConanException) as error: unzip(conanfile, archive, dest_dir, strip_root=True) - assert "The tgz file contains more than 1 folder in the root" in str(error.value) + assert "Can't untar a tgz containing files in the root with strip_root enabled" in str(error.value) def test_untargz_with_strip_root_and_pattern(): diff --git a/test/unittests/util/files/strip_root_extract_test.py b/test/unittests/util/files/strip_root_extract_test.py index 3f8746b9925..d65bcb36788 100644 --- a/test/unittests/util/files/strip_root_extract_test.py +++ b/test/unittests/util/files/strip_root_extract_test.py @@ -255,5 +255,20 @@ def test_invalid_flat_single_file(self): # Extract without the subfolder extract_folder = temp_folder() - with self.assertRaisesRegex(ConanException, "The tgz file contains a file in the root"): + with self.assertRaisesRegex(ConanException, "Can't untar a tgz containing files in the root with strip_root enabled"): + unzip(ConanFileMock(), tgz_file, destination=extract_folder, strip_root=True) + + def test_invalid_flat_multiple_file(self): + tmp_folder = temp_folder() + with chdir(tmp_folder): + save("file1", "contentsfile1") + save("file2", "contentsfile2") + + zip_folder = temp_folder() + tgz_file = os.path.join(zip_folder, "file.tar.gz") + self._compress_folder(tmp_folder, tgz_file) + + # Extract without the subfolder + extract_folder = temp_folder() + with self.assertRaisesRegex(ConanException, "Can't untar a tgz containing files in the root with strip_root enabled"): unzip(ConanFileMock(), tgz_file, destination=extract_folder, strip_root=True)