Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions man/ostree.repo-config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,18 @@ Boston, MA 02111-1307, USA.
</listitem>
</varlistentry>

<varlistentry>
<term><varname>per-object-fsync</varname></term>
<listitem><para>By default, OSTree will batch fsync() after
writing everything; however, this can cause latency spikes
for other processes which are also invoking fsync().
Turn on this boolean to reduce potential latency spikes,
at the cost of slowing down OSTree updates. You most
likely want this on by default for "background" OS updates.
</para>
</listitem>
</varlistentry>

<varlistentry>
<term><varname>min-free-space-percent</varname></term>
<listitem>
Expand Down
113 changes: 75 additions & 38 deletions src/libostree/ostree-repo-commit.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,33 @@
#define FICLONE _IOW(0x94, 9, int)
#endif


/* If fsync is enabled and we're in a txn, we write into a staging dir for
* commit, but we also allow direct writes into objects/ for e.g. hardlink
* imports.
/* Understanding ostree's fsync strategy
*
* A long time ago, ostree used to invoke fsync() on each object,
* then move it into the objects directory. However, it turned
* out to be a *lot* faster to write the objects into a separate "staging"
* directory (letting the filesystem handle writeback how it likes)
* and then only walk over each of the files, fsync(), then rename()
* into place. See also https://lwn.net/Articles/789024/
*
* (We also support a "disable fsync entirely" mode, where you don't
* care about integrity; e.g. test suites using disposable VMs).
*
* This "delayed fsync" pattern though is much worse for other concurrent processes
* like databases because it forces a lot to go through the filesystem
* journal at once once we do the sync. So now we support a `per_object_fsync`
* option that again invokes `fsync()` directly. This also notably
* provides "backpressure", ensuring we aren't queuing up a huge amount
* of I/O at once.
*/

/* The directory where we place content */
static int
commit_dest_dfd (OstreeRepo *self)
{
if (self->in_transaction && !self->disable_fsync)
if (self->per_object_fsync)
return self->objects_dir_fd;
else if (self->in_transaction && !self->disable_fsync)
return self->commit_stagedir.fd;
else
return self->objects_dir_fd;
Expand Down Expand Up @@ -420,7 +438,7 @@ commit_loose_regfile_object (OstreeRepo *self,
/* Ensure that in case of a power cut, these files have the data we
* want. See http://lwn.net/Articles/322823/
*/
if (!self->in_transaction && !self->disable_fsync)
if (!self->disable_fsync && self->per_object_fsync)
{
if (fsync (tmpf->fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
Expand Down Expand Up @@ -1835,6 +1853,52 @@ ostree_repo_prepare_transaction (OstreeRepo *self,
return TRUE;
}

/* Synchronize the directories holding the objects */
static gboolean
fsync_object_dirs (OstreeRepo *self,
GCancellable *cancellable,
GError **error)
{
GLNX_AUTO_PREFIX_ERROR ("fsync objdirs", error);
g_auto(GLnxDirFdIterator) dfd_iter = { 0, };

if (self->disable_fsync)
return TRUE; /* No fsync? Nothing to do then. */

if (!glnx_dirfd_iterator_init_at (self->objects_dir_fd, ".", FALSE, &dfd_iter, error))
return FALSE;
while (TRUE)
{
struct dirent *dent;
if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error))
return FALSE;
if (dent == NULL)
break;
if (dent->d_type != DT_DIR)
continue;
/* All object directories only have two character entries */
if (strlen (dent->d_name) != 2)
continue;

glnx_autofd int target_dir_fd = -1;
if (!glnx_opendirat (self->objects_dir_fd, dent->d_name, FALSE,
&target_dir_fd, error))
return FALSE;
/* This synchronizes the directory to ensure all the objects we wrote
* are there. We need to do this before removing the .commitpartial
* stamp (or have a ref point to the commit).
*/
if (fsync (target_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
}

/* In case we created any loose object subdirs, make sure they are on disk */
if (fsync (self->objects_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");

return TRUE;
}

/* Called for commit, to iterate over the "staging" directory and rename all the
* objects into the primary objects/ location. Notably this is called only after
* syncfs() has potentially been invoked to ensure that all objects have been
Expand All @@ -1856,10 +1920,6 @@ rename_pending_loose_objects (OstreeRepo *self,
while (TRUE)
{
struct dirent *dent;
gboolean renamed_some_object = FALSE;
g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, };
char loose_objpath[_OSTREE_LOOSE_PATH_MAX];

if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error))
return FALSE;
if (dent == NULL)
Expand All @@ -1872,10 +1932,12 @@ rename_pending_loose_objects (OstreeRepo *self,
if (strlen (dent->d_name) != 2)
continue;

g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, };
if (!glnx_dirfd_iterator_init_at (dfd_iter.fd, dent->d_name, FALSE,
&child_dfd_iter, error))
return FALSE;

char loose_objpath[_OSTREE_LOOSE_PATH_MAX];
loose_objpath[0] = dent->d_name[0];
loose_objpath[1] = dent->d_name[1];
loose_objpath[2] = '/';
Expand All @@ -1899,37 +1961,9 @@ rename_pending_loose_objects (OstreeRepo *self,
if (!glnx_renameat (child_dfd_iter.fd, loose_objpath + 3,
self->objects_dir_fd, loose_objpath, error))
return FALSE;

renamed_some_object = TRUE;
}

if (renamed_some_object && !self->disable_fsync)
{
/* Ensure that in the case of a power cut all the directory metadata that
we want has reached the disk. In particular, we want this before we
update the refs to point to these objects. */
glnx_autofd int target_dir_fd = -1;

loose_objpath[2] = 0;

if (!glnx_opendirat (self->objects_dir_fd,
loose_objpath, FALSE,
&target_dir_fd,
error))
return FALSE;

if (fsync (target_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
}
}

/* In case we created any loose object subdirs, make sure they are on disk */
if (!self->disable_fsync)
{
if (fsync (self->objects_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
}

return TRUE;
}

Expand Down Expand Up @@ -2377,6 +2411,9 @@ ostree_repo_commit_transaction (OstreeRepo *self,
if (!rename_pending_loose_objects (self, cancellable, error))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we skip this function entirely in the case where we're writing directly into the object dir?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably, but per above we'd need to make more of the transaction/staging dir conditional.

return FALSE;

if (!fsync_object_dirs (self, cancellable, error))
return FALSE;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a purpose to the refactor into a separate function? Seems like fsync behaviour in this path hasn't really changed here, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In per-object-fsync mode we aren't using the staging dir; we still allocate one because making that conditional would be a notably bigger patch. So we need to sync the actual target objects/ directory. And in any case I think we can get away with doing it this way rather than syncing the staging dir in both cases.


g_debug ("txn commit %s", glnx_basename (self->commit_stagedir.path));
if (!glnx_tmpdir_delete (&self->commit_stagedir, cancellable, error))
return FALSE;
Expand Down
17 changes: 11 additions & 6 deletions src/libostree/ostree-repo-private.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,17 @@ G_BEGIN_DECLS
#define _OSTREE_MAX_OUTSTANDING_FETCHER_REQUESTS 8
#define _OSTREE_MAX_OUTSTANDING_DELTAPART_REQUESTS 2

/* In most cases, writing to disk should be much faster than
* fetching from the network, so we shouldn't actually hit
* this. But if using pipelining and e.g. pulling over LAN
* (or writing to slow media), we can have a runaway
* situation towards EMFILE.
/* We want some parallelism with disk writes, but we also
* want to avoid starting tens or hundreds of threads
* (via GTask) all writing to disk. Eventually we may
* use io_uring which handles backpressure correctly.
* Also, in "immediate fsync" mode, this helps provide
* much more backpressure, helping our I/O patterns
* be nicer for any concurrent processes, such as etcd
* or other databases.
* https://github.com/openshift/machine-config-operator/issues/1897
* */
#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 16
#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 3
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It shouldn't though would be good to do a sanity-check that over-the-network pull performance isn't affected by this.

Copy link
Member Author

@cgwalters cgwalters Jul 17, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With plain kola http-server over localhost I get 155MB/s and changing this slows it down from 9s to 15s...but that doesn't really matter a lot IMO.

I briefly investigated trying out tc but ended up doing strace -f -e write --inject=write:delay_enter=5ms kola http-server 2>/dev/null and there's no appreciable difference with that (~21MB/s, still a quite fast connection speed obviously). Both end up at 36s for my test case.


/* Well-known keys for the additional metadata field in a summary file. */
#define OSTREE_SUMMARY_LAST_MODIFIED "ostree.summary.last-modified"
Expand Down Expand Up @@ -147,6 +151,7 @@ struct OstreeRepo {
GError *writable_error;
gboolean in_transaction;
gboolean disable_fsync;
gboolean per_object_fsync;
gboolean disable_xattrs;
guint zlib_compression_level;
GHashTable *loose_object_devino_hash;
Expand Down
7 changes: 7 additions & 0 deletions src/libostree/ostree-repo-pull.c
Original file line number Diff line number Diff line change
Expand Up @@ -3286,6 +3286,7 @@ initiate_request (OtPullData *pull_data,
* * disable-sign-verify (b): Disable signapi verification of commits
* * disable-sign-verify-summary (b): Disable signapi verification of the summary
* * depth (i): How far in the history to traverse; default is 0, -1 means infinite
* * per-object-fsync (b): Perform disk writes more slowly, avoiding a single large I/O sync
* * disable-static-deltas (b): Do not use static deltas
* * require-static-deltas (b): Require static deltas
* * override-commit-ids (as): Array of specific commit IDs to fetch for refs
Expand Down Expand Up @@ -3340,6 +3341,7 @@ ostree_repo_pull_with_options (OstreeRepo *self,
g_autoptr(GVariantIter) collection_refs_iter = NULL;
g_autofree char **override_commit_ids = NULL;
g_autoptr(GSource) update_timeout = NULL;
gboolean opt_per_object_fsync = FALSE;
gboolean opt_gpg_verify_set = FALSE;
gboolean opt_gpg_verify_summary_set = FALSE;
gboolean opt_collection_refs_set = FALSE;
Expand Down Expand Up @@ -3385,6 +3387,7 @@ ostree_repo_pull_with_options (OstreeRepo *self,
(void) g_variant_lookup (options, "require-static-deltas", "b", &pull_data->require_static_deltas);
(void) g_variant_lookup (options, "override-commit-ids", "^a&s", &override_commit_ids);
(void) g_variant_lookup (options, "dry-run", "b", &pull_data->dry_run);
(void) g_variant_lookup (options, "per-object-fsync", "b", &opt_per_object_fsync);
(void) g_variant_lookup (options, "override-url", "&s", &url_override);
(void) g_variant_lookup (options, "inherit-transaction", "b", &inherit_transaction);
(void) g_variant_lookup (options, "http-headers", "@a(ss)", &pull_data->extra_headers);
Expand Down Expand Up @@ -3453,6 +3456,10 @@ ostree_repo_pull_with_options (OstreeRepo *self,
pull_data->main_context = g_main_context_ref_thread_default ();
pull_data->flags = flags;

/* TODO: Avoid mutating the repo object */
if (opt_per_object_fsync)
self->per_object_fsync = TRUE;

if (!opt_n_network_retries_set)
pull_data->n_network_retries = DEFAULT_N_NETWORK_RETRIES;

Expand Down
4 changes: 4 additions & 0 deletions src/libostree/ostree-repo.c
Original file line number Diff line number Diff line change
Expand Up @@ -2922,6 +2922,10 @@ reload_core_config (OstreeRepo *self,
ostree_repo_set_disable_fsync (self, TRUE);
}

if (!ot_keyfile_get_boolean_with_default (self->config, "core", "per-object-fsync",
FALSE, &self->per_object_fsync, error))
return FALSE;

/* See https://github.com/ostreedev/ostree/issues/758 */
if (!ot_keyfile_get_boolean_with_default (self->config, "core", "disable-xattrs",
FALSE, &self->disable_xattrs, error))
Expand Down
5 changes: 5 additions & 0 deletions src/ostree/ot-builtin-pull-local.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
static char *opt_remote;
static gboolean opt_commit_only;
static gboolean opt_disable_fsync;
static gboolean opt_per_object_fsync;
static gboolean opt_untrusted;
static gboolean opt_bareuseronly_files;
static gboolean opt_require_static_deltas;
Expand All @@ -50,6 +51,7 @@ static GOptionEntry options[] = {
{ "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL },
{ "remote", 0, 0, G_OPTION_ARG_STRING, &opt_remote, "Add REMOTE to refspec", "REMOTE" },
{ "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL },
{ "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL },
{ "untrusted", 0, 0, G_OPTION_ARG_NONE, &opt_untrusted, "Verify checksums of local sources (always enabled for HTTP pulls)", NULL },
{ "bareuseronly-files", 0, 0, G_OPTION_ARG_NONE, &opt_bareuseronly_files, "Reject regular files with mode outside of 0775 (world writable, suid, etc.)", NULL },
{ "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL },
Expand Down Expand Up @@ -188,6 +190,9 @@ ostree_builtin_pull_local (int argc, char **argv, OstreeCommandInvocation *invoc
g_variant_new_variant (g_variant_new_boolean (TRUE)));
g_variant_builder_add (&builder, "{s@v}", "disable-sign-verify-summary",
g_variant_new_variant (g_variant_new_boolean (TRUE)));
if (opt_per_object_fsync)
g_variant_builder_add (&builder, "{s@v}", "per-object-fsync",
g_variant_new_variant (g_variant_new_boolean (TRUE)));

if (console.is_tty)
progress = ostree_async_progress_new_and_connect (ostree_repo_pull_default_console_progress_changed, &console);
Expand Down
6 changes: 5 additions & 1 deletion src/ostree/ot-builtin-pull.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "otutil.h"

static gboolean opt_disable_fsync;
static gboolean opt_per_object_fsync;
static gboolean opt_mirror;
static gboolean opt_commit_only;
static gboolean opt_dry_run;
Expand Down Expand Up @@ -58,6 +59,7 @@ static GOptionEntry options[] = {
{ "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL },
{ "cache-dir", 0, 0, G_OPTION_ARG_FILENAME, &opt_cache_dir, "Use custom cache dir", NULL },
{ "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL },
{ "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL },
{ "disable-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_disable_static_deltas, "Do not use static deltas", NULL },
{ "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL },
{ "mirror", 0, 0, G_OPTION_ARG_NONE, &opt_mirror, "Write refs suitable for a mirror and fetches all refs if none provided", NULL },
Expand Down Expand Up @@ -325,7 +327,9 @@ ostree_builtin_pull (int argc, char **argv, OstreeCommandInvocation *invocation,
if (opt_localcache_repos)
g_variant_builder_add (&builder, "{s@v}", "localcache-repos",
g_variant_new_variant (g_variant_new_strv ((const char*const*)opt_localcache_repos, -1)));

if (opt_per_object_fsync)
g_variant_builder_add (&builder, "{s@v}", "per-object-fsync",
g_variant_new_variant (g_variant_new_boolean (TRUE)));
if (opt_http_headers)
{
GVariantBuilder hdr_builder;
Expand Down
14 changes: 12 additions & 2 deletions tests/pull-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ function verify_initial_contents() {
}

if has_gpgme; then
echo "1..34"
echo "1..35"
else
# 3 tests needs GPG support
echo "1..31"
echo "1..32"
fi

# Try both syntaxes
Expand All @@ -74,6 +74,16 @@ cd ${test_tmpdir}
verify_initial_contents
echo "ok pull contents"

# And a test with incremental fsync
repo_init --no-sign-verify
${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin main >out.txt
assert_file_has_content out.txt "[1-9][0-9]* metadata, [1-9][0-9]* content objects fetched"
${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin:main > out.txt
assert_not_file_has_content out.txt "[1-9][0-9]* content objects fetched"
${CMD_PREFIX} ostree --repo=repo fsck
verify_initial_contents
echo "ok pull --per-object-fsync"

cd ${test_tmpdir}
mkdir mirrorrepo
ostree_repo_init mirrorrepo --mode=archive
Expand Down