From a21ea5d88f597e09aaeebea1e96a5a57fd2a76ce Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 11 Dec 2020 00:34:11 +0100 Subject: [PATCH] create --sparse, file map support for the "fixed" chunker, see #14 a file map can be: - created internally inside chunkify by calling sparsemap, which uses SEEK_DATA / SEEK_HOLE to determine data and hole ranges inside a seekable sparse file. Usage: borg create --sparse --chunker-params=fixed,BLOCKSIZE ... BLOCKSIZE is the chunker blocksize here, not the filesystem blocksize! - made by some other means and given to the chunkify function. this is not used yet, but in future this could be used to only read the changed parts and seek over the (known) unchanged parts of a file. sparsemap: the generate range sizes are multiples of the fs block size. the tests assume 4kiB fs block size. --- setup.py | 2 +- src/borg/archive.py | 4 +- src/borg/archiver.py | 4 +- src/borg/chunker.pyx | 172 +++++++++++++++++++++------ src/borg/testsuite/chunker_pytest.py | 120 +++++++++++++++++++ 5 files changed, 260 insertions(+), 42 deletions(-) create mode 100644 src/borg/testsuite/chunker_pytest.py diff --git a/setup.py b/setup.py index 4152f9da2be..db029d288a5 100644 --- a/setup.py +++ b/setup.py @@ -297,5 +297,5 @@ def members_appended(*ds): setup_requires=['setuptools_scm>=1.7'], install_requires=install_requires, extras_require=extras_require, - python_requires='>=3.6', + python_requires='>=3.5', ) diff --git a/src/borg/archive.py b/src/borg/archive.py index fa0c7d7e635..c7d3e8db56e 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1167,7 +1167,7 @@ class FilesystemObjectProcessors: def __init__(self, *, metadata_collector, cache, key, add_item, process_file_chunks, - chunker_params, show_progress): + chunker_params, show_progress, sparse): self.metadata_collector = metadata_collector self.cache = cache self.key = key @@ -1178,7 +1178,7 @@ def __init__(self, *, metadata_collector, cache, key, self.hard_links = {} self.stats = 
Statistics() # threading: done by cache (including progress) self.cwd = os.getcwd() - self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed) + self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse) @contextmanager def create_helper(self, path, st, status=None, hardlinkable=True): diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 029776641a5..a8590ab9aa1 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -653,7 +653,7 @@ def create_inner(archive, cache, fso): checkpoint_interval=args.checkpoint_interval, rechunkify=False) fso = FilesystemObjectProcessors(metadata_collector=metadata_collector, cache=cache, key=key, process_file_chunks=cp.process_file_chunks, add_item=archive.add_item, - chunker_params=args.chunker_params, show_progress=args.progress) + chunker_params=args.chunker_params, show_progress=args.progress, sparse=args.sparse) create_inner(archive, cache, fso) else: create_inner(None, None, None) @@ -3341,6 +3341,8 @@ def define_borg_mount(parser): help='deprecated, use ``--noflags`` instead') fs_group.add_argument('--noflags', dest='noflags', action='store_true', help='do not read and store flags (e.g. NODUMP, IMMUTABLE) into archive') + fs_group.add_argument('--sparse', dest='sparse', action='store_true', + help='detect sparse holes in input (supported only by fixed chunker)') fs_group.add_argument('--files-cache', metavar='MODE', dest='files_cache_mode', type=FilesCacheMode, default=DEFAULT_FILES_CACHE_MODE_UI, help='operate files cache in MODE. 
default: %s' % DEFAULT_FILES_CACHE_MODE_UI) diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 68f9c010e29..90d73c3cfdf 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -2,6 +2,7 @@ API_VERSION = '1.2_01' +import errno import os from libc.stdlib cimport free @@ -19,11 +20,85 @@ cdef extern from "_chunker.c": uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h) +def dread(offset, size, fd=None, fh=-1): + use_fh = fh >= 0 + if use_fh: + data = os.read(fh, size) + if hasattr(os, 'posix_fadvise'): + # UNIX only and, in case of block sizes that are not a multiple of the + # system's page size, better be used with a bug fixed linux kernel > 4.6.0, + # see comment/workaround in _chunker.c and borgbackup issue #907. + os.posix_fadvise(fh, offset, len(data), os.POSIX_FADV_DONTNEED) + return data + else: + return fd.read(size) + + +def dseek(amount, whence, fd=None, fh=-1): + use_fh = fh >= 0 + if use_fh: + return os.lseek(fh, amount, whence) + else: + return fd.seek(amount, whence) + + +def dpos_curr_end(fd=None, fh=-1): + # return current position, file end position (== file length) + curr = dseek(0, os.SEEK_CUR, fd, fh) + end = dseek(0, os.SEEK_END, fd, fh) + dseek(curr, os.SEEK_SET, fd, fh) + return curr, end + + +def sparsemap(fd=None, fh=-1): + """ + generator yielding (start, length, type) tuples, + indicating data (True) and hole (False) ranges inside the file. + + note: + the map is generated starting from the current seek position (it + is not required to be 0 / to be at the start of the file) and + work from there up to the end of the file. + when the generator is finished, the file pointer position will be + reset to where it was before calling this function. + """ + curr, file_len = dpos_curr_end(fd, fh) # start is the CURRENT position now. 
+ start = curr + try: + whence = os.SEEK_HOLE + while True: + is_data = whence == os.SEEK_HOLE # True: range with data, False: range is a hole + try: + end = dseek(start, whence, fd, fh) + except OSError as e: + if e.errno == errno.ENXIO: + if not is_data and start < file_len: + # if there is only sparse space at the end of a file, we can not + # find the file end by SEEK_DATA (because run into ENXIO), thus + # we must manually deal with this case: + end = file_len + yield (start, end - start, is_data) + break + else: + raise + # we do not want to yield zero-length ranges with start == end: + if end > start: + yield (start, end - start, is_data) + start = end + whence = os.SEEK_DATA if is_data else os.SEEK_HOLE + finally: + # seek to same position as before calling this function + dseek(curr, os.SEEK_SET, fd, fh) + + class ChunkerFixed: """ - Fixed blocksize Chunker, optionally supporting a header block of different size. + Fixed blocksize Chunker, optionally supporting: - This is a very simple chunker for input data with known block/record sizes: + - a header block of different size + - using a sparsemap to only read ranges with data + + This is a simple chunker for input data with known block/record sizes: - raw disk images - block devices @@ -32,11 +107,15 @@ class ChunkerFixed: Note: the last block of the input data may be less than the block size, this is supported and not considered to be an error. """ - def __init__(self, block_size, header_size=0): + def __init__(self, block_size, header_size=0, sparse=False): self.block_size = block_size self.header_size = header_size + # should borg try to do sparse input processing? + # whether it actually can be done depends on the input file being seekable. 
+ self.try_sparse = sparse and hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE') + self.zeros = memoryview(bytes(block_size)) if self.try_sparse else None - def chunkify(self, fd, fh=-1): + def chunkify(self, fd=None, fh=-1, fmap=None): """ Cut a file into chunks. @@ -44,40 +123,56 @@ class ChunkerFixed: :param fh: OS-level file handle (if available), defaults to -1 which means not to use OS-level fd. """ + if fmap is None: + if self.try_sparse: + try: + if self.header_size > 0: + header_map = [(0, self.header_size, True), ] + dseek(self.header_size, os.SEEK_SET, fd, fh) + body_map = list(sparsemap(fd, fh)) + dseek(0, os.SEEK_SET, fd, fh) + else: + header_map = [] + body_map = list(sparsemap(fd, fh)) + except OSError as err: + # seeking in sparsemap did not work + pass + else: + fmap = header_map + body_map + + if fmap is None: + # either sparse processing (building the fmap) was not tried or it failed. + # in these cases, we just build a "fake fmap" that considers the whole file + # as range(s) of data (no holes), so we can use the same code. + # we build different fmaps here for the purpose of correct block alignment + # with or without a header block (of potentially different size). + if self.header_size > 0: + header_map = [(0, self.header_size, True), ] + body_map = [(self.header_size, 2 ** 62, True), ] + else: + header_map = [] + body_map = [(0, 2 ** 62, True), ] + fmap = header_map + body_map + offset = 0 - use_fh = fh >= 0 - - if use_fh: - def read(size): - nonlocal offset - data = os.read(fh, size) - amount = len(data) - if hasattr(os, 'posix_fadvise'): - # UNIX only and, in case of block sizes that are not a multiple of the - # system's page size, better be used with a bug fixed linux kernel > 4.6.0, - # see comment/workaround in _chunker.c and borgbackup issue #907. 
- os.posix_fadvise(fh, offset, amount, os.POSIX_FADV_DONTNEED) - offset += amount - return data - else: - def read(size): - nonlocal offset - data = fd.read(size) - amount = len(data) - offset += amount - return data - - if self.header_size > 0: - data = read(self.header_size) - if data: - yield data - else: - data = True # get into next while loop - while data: - data = read(self.block_size) - if data: - yield data - # empty data means we are at EOF and we terminate the generator. + for range_start, range_size, is_data in fmap: + while range_size: + wanted = min(range_size, self.block_size) + if is_data: + # read blocks from the range with the desired read_size, if possible + data = dread(offset, wanted, fd, fh) + else: # hole + # seek over blocks from the range with the desired read_size, if possible + pos = dseek(wanted, os.SEEK_CUR, fd, fh) + data = self.zeros[:pos - offset] # for now, create zero-bytes here + got = len(data) + if got > 0: + offset += got + range_size -= got + yield data # later, use a better api that tags data vs. sparse + if got < wanted: + # we did not get enough data, looks like early EOF. + return cdef class Chunker: @@ -129,7 +224,8 @@ def get_chunker(algo, *params, **kw): seed = kw['seed'] return Chunker(seed, *params) if algo == 'fixed': - return ChunkerFixed(*params) + sparse = kw['sparse'] + return ChunkerFixed(*params, sparse=sparse) raise TypeError('unsupported chunker algo %r' % algo) diff --git a/src/borg/testsuite/chunker_pytest.py b/src/borg/testsuite/chunker_pytest.py new file mode 100644 index 00000000000..89e12053534 --- /dev/null +++ b/src/borg/testsuite/chunker_pytest.py @@ -0,0 +1,120 @@ +from io import BytesIO +import os + +import pytest + +from ..chunker import ChunkerFixed, sparsemap +from ..constants import * # NOQA + +BS = 4096 # fs block size + +# some sparse files. X = content blocks, _ = sparse blocks. 
+# X__XXX____ +map_sparse1 = [ + (0 * BS, 1 * BS, True), + (1 * BS, 2 * BS, False), + (3 * BS, 3 * BS, True), + (6 * BS, 4 * BS, False), +] + +# _XX___XXXX +map_sparse2 = [ + (0 * BS, 1 * BS, False), + (1 * BS, 2 * BS, True), + (3 * BS, 3 * BS, False), + (6 * BS, 4 * BS, True), +] + +# XXX +map_notsparse = [(0 * BS, 3 * BS, True), ] + +# ___ +map_onlysparse = [(0 * BS, 3 * BS, False), ] + + +def make_sparsefile(fname, sparsemap, header_size=0): + with open(fname, 'wb') as fd: + total = 0 + if header_size: + fd.write(b'H' * header_size) + total += header_size + for offset, size, is_data in sparsemap: + if is_data: + fd.write(b'X' * size) + else: + fd.seek(size, os.SEEK_CUR) + total += size + fd.truncate(total) + assert os.path.getsize(fname) == total + + +def make_content(sparsemap, header_size=0): + with BytesIO() as fd: + total = 0 + if header_size: + fd.write(b'H' * header_size) + total += header_size + for offset, size, is_data in sparsemap: + if is_data: + fd.write(b'X' * size) + else: + fd.write(b'\0' * size) + total += size + content = fd.getvalue() + assert len(content) == total + return content + + +@pytest.mark.parametrize("fname, sparse_map", [ + ('sparse1', map_sparse1), + ('sparse2', map_sparse2), + ('onlysparse', map_onlysparse), + ('notsparse', map_notsparse), +]) +def test_sparsemap(tmpdir, fname, sparse_map): + + def get_sparsemap_fh(fname): + fh = os.open(fname, flags=os.O_RDONLY) + try: + return list(sparsemap(fh=fh)) + finally: + os.close(fh) + + def get_sparsemap_fd(fname): + with open(fname, 'rb') as fd: + return list(sparsemap(fd=fd)) + + fn = str(tmpdir / fname) + make_sparsefile(fn, sparse_map) + assert get_sparsemap_fh(fn) == sparse_map + assert get_sparsemap_fd(fn) == sparse_map + + +@pytest.mark.parametrize("fname, sparse_map, header_size, sparse", [ + ('sparse1', map_sparse1, 0, False), + ('sparse1', map_sparse1, 0, True), + ('sparse1', map_sparse1, BS, False), + ('sparse1', map_sparse1, BS, True), + ('sparse2', map_sparse2, 0, False), + 
('sparse2', map_sparse2, 0, True), + ('sparse2', map_sparse2, BS, False), + ('sparse2', map_sparse2, BS, True), + ('onlysparse', map_onlysparse, 0, False), + ('onlysparse', map_onlysparse, 0, True), + ('onlysparse', map_onlysparse, BS, False), + ('onlysparse', map_onlysparse, BS, True), + ('notsparse', map_notsparse, 0, False), + ('notsparse', map_notsparse, 0, True), + ('notsparse', map_notsparse, BS, False), + ('notsparse', map_notsparse, BS, True), +]) +def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse): + + def get_chunks(fname, sparse, header_size): + chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse) + with open(fname, 'rb') as fd: + return b''.join([c for c in chunker.chunkify(fd)]) + + fn = str(tmpdir / fname) + make_sparsefile(fn, sparse_map, header_size=header_size) + assert get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size)