Skip to content

Commit 44f714d

Browse files
committed
Btrfs: improve performance on fsync against new inode after rename/unlink
With commit 56f23fd ("Btrfs: fix file/data loss caused by fsync after rename and new inode") we got simple fix for a functional issue when the following sequence of actions is done: at transaction N create file A at directory D at transaction N + M (where M >= 1) move/rename existing file A from directory D to directory E create a new file named A at directory D fsync the new file power fail The solution was to simply detect such scenario and fallback to a full transaction commit when we detect it. However this turned out to had a significant impact on throughput (and a bit on latency too) for benchmarks using the dbench tool, which simulates real workloads from smbd (Samba) servers. For example on a test vm (with a debug kernel): Unpatched: Throughput 19.1572 MB/sec 32 clients 32 procs max_latency=1005.229 ms Patched: Throughput 23.7015 MB/sec 32 clients 32 procs max_latency=809.206 ms The patched results (this patch is applied) are similar to the results of a kernel with the commit 56f23fd ("Btrfs: fix file/data loss caused by fsync after rename and new inode") reverted. This change avoids the fallback to a transaction commit and instead makes sure all the names of the conflicting inode (the one that had a name in a past transaction that matches the name of the new file in the same parent directory) are logged so that at log replay time we don't lose neither the new file nor the old file, and the old file gets the name it was renamed to. This also ends up avoiding a full transaction commit for a similar case that involves an unlink instead of a rename of the old file: at transaction N create file A at directory D at transaction N + M (where M >= 1) remove file A create a new file named A at directory D fsync the new file power fail Signed-off-by: Filipe Manana <[email protected]>
1 parent 6771089 commit 44f714d

File tree

2 files changed

+95
-9
lines changed

2 files changed

+95
-9
lines changed

fs/btrfs/inode.c

+18-1
Original file line numberDiff line numberDiff line change
@@ -4203,6 +4203,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
42034203
int err = 0;
42044204
struct btrfs_root *root = BTRFS_I(dir)->root;
42054205
struct btrfs_trans_handle *trans;
4206+
u64 last_unlink_trans;
42064207

42074208
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
42084209
return -ENOTEMPTY;
@@ -4225,11 +4226,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
42254226
if (err)
42264227
goto out;
42274228

4229+
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4230+
42284231
/* now the directory is empty */
42294232
err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
42304233
dentry->d_name.name, dentry->d_name.len);
4231-
if (!err)
4234+
if (!err) {
42324235
btrfs_i_size_write(inode, 0);
4236+
/*
4237+
* Propagate the last_unlink_trans value of the deleted dir to
4238+
* its parent directory. This is to prevent an unrecoverable
4239+
* log tree in the case we do something like this:
4240+
* 1) create dir foo
4241+
* 2) create snapshot under dir foo
4242+
* 3) delete the snapshot
4243+
* 4) rmdir foo
4244+
* 5) mkdir foo
4245+
* 6) fsync foo or some file inside foo
4246+
*/
4247+
if (last_unlink_trans >= trans->transid)
4248+
BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4249+
}
42334250
out:
42344251
btrfs_end_transaction(trans, root);
42354252
btrfs_btree_balance_dirty(root);

fs/btrfs/tree-log.c

+77-8
Original file line numberDiff line numberDiff line change
@@ -4469,7 +4469,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
44694469
static int btrfs_check_ref_name_override(struct extent_buffer *eb,
44704470
const int slot,
44714471
const struct btrfs_key *key,
4472-
struct inode *inode)
4472+
struct inode *inode,
4473+
u64 *other_ino)
44734474
{
44744475
int ret;
44754476
struct btrfs_path *search_path;
@@ -4528,7 +4529,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
45284529
search_path, parent,
45294530
name, this_name_len, 0);
45304531
if (di && !IS_ERR(di)) {
4531-
ret = 1;
4532+
struct btrfs_key di_key;
4533+
4534+
btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4535+
di, &di_key);
4536+
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4537+
ret = 1;
4538+
*other_ino = di_key.objectid;
4539+
} else {
4540+
ret = -EAGAIN;
4541+
}
45324542
goto out;
45334543
} else if (IS_ERR(di)) {
45344544
ret = PTR_ERR(di);
@@ -4718,16 +4728,71 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
47184728
if ((min_key.type == BTRFS_INODE_REF_KEY ||
47194729
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
47204730
BTRFS_I(inode)->generation == trans->transid) {
4731+
u64 other_ino = 0;
4732+
47214733
ret = btrfs_check_ref_name_override(path->nodes[0],
47224734
path->slots[0],
4723-
&min_key, inode);
4735+
&min_key, inode,
4736+
&other_ino);
47244737
if (ret < 0) {
47254738
err = ret;
47264739
goto out_unlock;
47274740
} else if (ret > 0) {
4728-
err = 1;
4729-
btrfs_set_log_full_commit(root->fs_info, trans);
4730-
goto out_unlock;
4741+
struct btrfs_key inode_key;
4742+
struct inode *other_inode;
4743+
4744+
if (ins_nr > 0) {
4745+
ins_nr++;
4746+
} else {
4747+
ins_nr = 1;
4748+
ins_start_slot = path->slots[0];
4749+
}
4750+
ret = copy_items(trans, inode, dst_path, path,
4751+
&last_extent, ins_start_slot,
4752+
ins_nr, inode_only,
4753+
logged_isize);
4754+
if (ret < 0) {
4755+
err = ret;
4756+
goto out_unlock;
4757+
}
4758+
ins_nr = 0;
4759+
btrfs_release_path(path);
4760+
inode_key.objectid = other_ino;
4761+
inode_key.type = BTRFS_INODE_ITEM_KEY;
4762+
inode_key.offset = 0;
4763+
other_inode = btrfs_iget(root->fs_info->sb,
4764+
&inode_key, root,
4765+
NULL);
4766+
/*
4767+
* If the other inode that had a conflicting dir
4768+
* entry was deleted in the current transaction,
4769+
* we don't need to do more work nor fallback to
4770+
* a transaction commit.
4771+
*/
4772+
if (IS_ERR(other_inode) &&
4773+
PTR_ERR(other_inode) == -ENOENT) {
4774+
goto next_key;
4775+
} else if (IS_ERR(other_inode)) {
4776+
err = PTR_ERR(other_inode);
4777+
goto out_unlock;
4778+
}
4779+
/*
4780+
* We are safe logging the other inode without
4781+
* acquiring its i_mutex as long as we log with
4782+
* the LOG_INODE_EXISTS mode. We're safe against
4783+
* concurrent renames of the other inode as well
4784+
* because during a rename we pin the log and
4785+
* update the log with the new name before we
4786+
* unpin it.
4787+
*/
4788+
err = btrfs_log_inode(trans, root, other_inode,
4789+
LOG_INODE_EXISTS,
4790+
0, LLONG_MAX, ctx);
4791+
iput(other_inode);
4792+
if (err)
4793+
goto out_unlock;
4794+
else
4795+
goto next_key;
47314796
}
47324797
}
47334798

@@ -4795,7 +4860,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
47954860
ins_nr = 0;
47964861
}
47974862
btrfs_release_path(path);
4798-
4863+
next_key:
47994864
if (min_key.offset < (u64)-1) {
48004865
min_key.offset++;
48014866
} else if (min_key.type < max_key.type) {
@@ -4989,8 +5054,12 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
49895054
if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
49905055
break;
49915056

4992-
if (IS_ROOT(parent))
5057+
if (IS_ROOT(parent)) {
5058+
inode = d_inode(parent);
5059+
if (btrfs_must_commit_transaction(trans, inode))
5060+
ret = 1;
49935061
break;
5062+
}
49945063

49955064
parent = dget_parent(parent);
49965065
dput(old_parent);

0 commit comments

Comments
 (0)