-
Notifications
You must be signed in to change notification settings - Fork 339
pull: Add --per-object-fsync #2152
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -51,15 +51,33 @@ | |
| #define FICLONE _IOW(0x94, 9, int) | ||
| #endif | ||
|
|
||
|
|
||
| /* If fsync is enabled and we're in a txn, we write into a staging dir for | ||
| * commit, but we also allow direct writes into objects/ for e.g. hardlink | ||
| * imports. | ||
| /* Understanding ostree's fsync strategy | ||
| * | ||
| * A long time ago, ostree used to invoke fsync() on each object, | ||
| * then move it into the objects directory. However, it turned | ||
| * out to be a *lot* faster to write the objects into a separate "staging" | ||
| * directory (letting the filesystem handle writeback how it likes) | ||
| * and only afterwards walk over each of the files, fsync(), then rename() | ||
| * into place. See also https://lwn.net/Articles/789024/ | ||
| * | ||
| * (We also support a "disable fsync entirely" mode, where you don't | ||
| * care about integrity; e.g. test suites using disposable VMs). | ||
| * | ||
| * This "delayed fsync" pattern, though, is much worse for other concurrent processes | ||
| * like databases, because it forces a lot of data to go through the filesystem | ||
| * journal all at once when we do the sync. So now we support a `per_object_fsync` | ||
| * option that again invokes `fsync()` directly. This also notably | ||
| * provides "backpressure", ensuring we aren't queuing up a huge amount | ||
| * of I/O at once. | ||
| */ | ||
|
|
||
| /* The directory where we place content */ | ||
| static int | ||
| commit_dest_dfd (OstreeRepo *self) | ||
| { | ||
| if (self->in_transaction && !self->disable_fsync) | ||
| if (self->per_object_fsync) | ||
| return self->objects_dir_fd; | ||
| else if (self->in_transaction && !self->disable_fsync) | ||
| return self->commit_stagedir.fd; | ||
| else | ||
| return self->objects_dir_fd; | ||
|
|
@@ -420,7 +438,7 @@ commit_loose_regfile_object (OstreeRepo *self, | |
| /* Ensure that in case of a power cut, these files have the data we | ||
| * want. See http://lwn.net/Articles/322823/ | ||
| */ | ||
| if (!self->in_transaction && !self->disable_fsync) | ||
| if (!self->disable_fsync && self->per_object_fsync) | ||
| { | ||
| if (fsync (tmpf->fd) == -1) | ||
| return glnx_throw_errno_prefix (error, "fsync"); | ||
|
|
@@ -1835,6 +1853,52 @@ ostree_repo_prepare_transaction (OstreeRepo *self, | |
| return TRUE; | ||
| } | ||
|
|
||
| /* Synchronize the directories holding the objects. | ||
| * | ||
| * Returns TRUE immediately when self->disable_fsync is set. Otherwise every | ||
| * directory entry directly under self->objects_dir_fd whose name is exactly | ||
| * two characters long is opened and fsync()ed, and finally | ||
| * self->objects_dir_fd itself is fsync()ed. | ||
| * | ||
| * Iteration, open and fsync failures abort the walk and are reported through | ||
| * the return value. NOTE(review): the glnx helpers presumably fill in @error, | ||
| * prefixed with "fsync objdirs" -- confirm against libglnx. | ||
| */ | ||
| static gboolean | ||
| fsync_object_dirs (OstreeRepo *self, | ||
| GCancellable *cancellable, | ||
| GError **error) | ||
| { | ||
| GLNX_AUTO_PREFIX_ERROR ("fsync objdirs", error); | ||
| g_auto(GLnxDirFdIterator) dfd_iter = { 0, }; | ||
|
|
||
| if (self->disable_fsync) | ||
| return TRUE; /* No fsync? Nothing to do then. */ | ||
|
|
||
| if (!glnx_dirfd_iterator_init_at (self->objects_dir_fd, ".", FALSE, &dfd_iter, error)) | ||
| return FALSE; | ||
| while (TRUE) | ||
| { | ||
| struct dirent *dent; | ||
| if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error)) | ||
| return FALSE; | ||
| if (dent == NULL) | ||
| break; | ||
| if (dent->d_type != DT_DIR) | ||
| continue; | ||
| /* All object directories have two-character names; skip any other entry */ | ||
| if (strlen (dent->d_name) != 2) | ||
| continue; | ||
|
|
||
| glnx_autofd int target_dir_fd = -1; | ||
| if (!glnx_opendirat (self->objects_dir_fd, dent->d_name, FALSE, | ||
| &target_dir_fd, error)) | ||
| return FALSE; | ||
| /* This synchronizes the directory to ensure all the objects we wrote | ||
| * are there. We need to do this before removing the .commitpartial | ||
| * stamp (or before having a ref point to the commit). | ||
| * NOTE(review): target_dir_fd is presumably closed automatically by | ||
| * glnx_autofd when it goes out of scope -- confirm against libglnx. | ||
| */ | ||
| if (fsync (target_dir_fd) == -1) | ||
| return glnx_throw_errno_prefix (error, "fsync"); | ||
| } | ||
|
|
||
| /* In case we created any loose object subdirs, make sure they are on disk */ | ||
| if (fsync (self->objects_dir_fd) == -1) | ||
| return glnx_throw_errno_prefix (error, "fsync"); | ||
|
|
||
| return TRUE; | ||
| } | ||
|
|
||
| /* Called for commit, to iterate over the "staging" directory and rename all the | ||
| * objects into the primary objects/ location. Notably this is called only after | ||
| * syncfs() has potentially been invoked to ensure that all objects have been | ||
|
|
@@ -1856,10 +1920,6 @@ rename_pending_loose_objects (OstreeRepo *self, | |
| while (TRUE) | ||
| { | ||
| struct dirent *dent; | ||
| gboolean renamed_some_object = FALSE; | ||
| g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, }; | ||
| char loose_objpath[_OSTREE_LOOSE_PATH_MAX]; | ||
|
|
||
| if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error)) | ||
| return FALSE; | ||
| if (dent == NULL) | ||
|
|
@@ -1872,10 +1932,12 @@ rename_pending_loose_objects (OstreeRepo *self, | |
| if (strlen (dent->d_name) != 2) | ||
| continue; | ||
|
|
||
| g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, }; | ||
| if (!glnx_dirfd_iterator_init_at (dfd_iter.fd, dent->d_name, FALSE, | ||
| &child_dfd_iter, error)) | ||
| return FALSE; | ||
|
|
||
| char loose_objpath[_OSTREE_LOOSE_PATH_MAX]; | ||
| loose_objpath[0] = dent->d_name[0]; | ||
| loose_objpath[1] = dent->d_name[1]; | ||
| loose_objpath[2] = '/'; | ||
|
|
@@ -1899,37 +1961,9 @@ rename_pending_loose_objects (OstreeRepo *self, | |
| if (!glnx_renameat (child_dfd_iter.fd, loose_objpath + 3, | ||
| self->objects_dir_fd, loose_objpath, error)) | ||
| return FALSE; | ||
|
|
||
| renamed_some_object = TRUE; | ||
| } | ||
|
|
||
| if (renamed_some_object && !self->disable_fsync) | ||
| { | ||
| /* Ensure that in the case of a power cut all the directory metadata that | ||
| we want has reached the disk. In particular, we want this before we | ||
| update the refs to point to these objects. */ | ||
| glnx_autofd int target_dir_fd = -1; | ||
|
|
||
| loose_objpath[2] = 0; | ||
|
|
||
| if (!glnx_opendirat (self->objects_dir_fd, | ||
| loose_objpath, FALSE, | ||
| &target_dir_fd, | ||
| error)) | ||
| return FALSE; | ||
|
|
||
| if (fsync (target_dir_fd) == -1) | ||
| return glnx_throw_errno_prefix (error, "fsync"); | ||
| } | ||
| } | ||
|
|
||
| /* In case we created any loose object subdirs, make sure they are on disk */ | ||
| if (!self->disable_fsync) | ||
| { | ||
| if (fsync (self->objects_dir_fd) == -1) | ||
| return glnx_throw_errno_prefix (error, "fsync"); | ||
| } | ||
|
|
||
| return TRUE; | ||
| } | ||
|
|
||
|
|
@@ -2377,6 +2411,9 @@ ostree_repo_commit_transaction (OstreeRepo *self, | |
| if (!rename_pending_loose_objects (self, cancellable, error)) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can't we skip this function entirely in the case where we're writing directly into the object dir?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably, but per above we'd need to make more of the transaction/staging dir conditional. |
||
| return FALSE; | ||
|
|
||
| if (!fsync_object_dirs (self, cancellable, error)) | ||
| return FALSE; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a purpose to the refactor into a separate function? Seems like
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In |
||
|
|
||
| g_debug ("txn commit %s", glnx_basename (self->commit_stagedir.path)); | ||
| if (!glnx_tmpdir_delete (&self->commit_stagedir, cancellable, error)) | ||
| return FALSE; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,13 +38,17 @@ G_BEGIN_DECLS | |
| #define _OSTREE_MAX_OUTSTANDING_FETCHER_REQUESTS 8 | ||
| #define _OSTREE_MAX_OUTSTANDING_DELTAPART_REQUESTS 2 | ||
|
|
||
| /* In most cases, writing to disk should be much faster than | ||
| * fetching from the network, so we shouldn't actually hit | ||
| * this. But if using pipelining and e.g. pulling over LAN | ||
| * (or writing to slow media), we can have a runaway | ||
| * situation towards EMFILE. | ||
| /* We want some parallelism with disk writes, but we also | ||
| * want to avoid starting tens or hundreds of threads | ||
| * (via GTask) all writing to disk. Eventually we may | ||
| * use io_uring which handles backpressure correctly. | ||
| * Also, in "immediate fsync" mode, this helps provide | ||
| * much more backpressure, helping our I/O patterns | ||
| * be nicer for any concurrent processes, such as etcd | ||
| * or other databases. | ||
| * https://github.com/openshift/machine-config-operator/issues/1897 | ||
| * */ | ||
| #define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 16 | ||
| #define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 3 | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It shouldn't, though it would be good to do a sanity check that over-the-network pull performance isn't affected by this.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With plain I briefly investigated trying out |
||
|
|
||
| /* Well-known keys for the additional metadata field in a summary file. */ | ||
| #define OSTREE_SUMMARY_LAST_MODIFIED "ostree.summary.last-modified" | ||
|
|
@@ -147,6 +151,7 @@ struct OstreeRepo { | |
| GError *writable_error; | ||
| gboolean in_transaction; | ||
| gboolean disable_fsync; | ||
| gboolean per_object_fsync; | ||
| gboolean disable_xattrs; | ||
| guint zlib_compression_level; | ||
| GHashTable *loose_object_devino_hash; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.