Skip to content

Commit b436b9b

Browse files
jankaratytso
authored andcommitted
ext4: Wait for proper transaction commit on fsync
We cannot rely on buffer dirty bits during fsync because pdflush can come before fsync is called and clear dirty bits without forcing a transaction commit. What we do is that we track which transaction has last changed the inode and which transaction last changed allocation and force it to disk on fsync. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
1 parent 194074a commit b436b9b

File tree

6 files changed

+80
-31
lines changed

6 files changed

+80
-31
lines changed

fs/ext4/ext4.h

+7
Original file line numberDiff line numberDiff line change
@@ -709,6 +709,13 @@ struct ext4_inode_info {
709709
struct list_head i_aio_dio_complete_list;
710710
/* current io_end structure for async DIO write*/
711711
ext4_io_end_t *cur_aio_dio;
712+
713+
/*
714+
* Transactions that contain inode's metadata needed to complete
715+
* fsync and fdatasync, respectively.
716+
*/
717+
tid_t i_sync_tid;
718+
tid_t i_datasync_tid;
712719
};
713720

714721
/*

fs/ext4/ext4_jbd2.h

+13
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
249249
return 0;
250250
}
251251

252+
static inline void ext4_update_inode_fsync_trans(handle_t *handle,
253+
struct inode *inode,
254+
int datasync)
255+
{
256+
struct ext4_inode_info *ei = EXT4_I(inode);
257+
258+
if (ext4_handle_valid(handle)) {
259+
ei->i_sync_tid = handle->h_transaction->t_tid;
260+
if (datasync)
261+
ei->i_datasync_tid = handle->h_transaction->t_tid;
262+
}
263+
}
264+
252265
/* super.c */
253266
int ext4_force_commit(struct super_block *sb);
254267

fs/ext4/extents.c

+12-2
Original file line numberDiff line numberDiff line change
@@ -3058,6 +3058,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
30583058
if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
30593059
ret = ext4_convert_unwritten_extents_dio(handle, inode,
30603060
path);
3061+
if (ret >= 0)
3062+
ext4_update_inode_fsync_trans(handle, inode, 1);
30613063
goto out2;
30623064
}
30633065
/* buffered IO case */
@@ -3085,6 +3087,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
30853087
ret = ext4_ext_convert_to_initialized(handle, inode,
30863088
path, iblock,
30873089
max_blocks);
3090+
if (ret >= 0)
3091+
ext4_update_inode_fsync_trans(handle, inode, 1);
30883092
out:
30893093
if (ret <= 0) {
30903094
err = ret;
@@ -3323,10 +3327,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
33233327
allocated = ext4_ext_get_actual_len(&newex);
33243328
set_buffer_new(bh_result);
33253329

3326-
/* Cache only when it is _not_ an uninitialized extent */
3327-
if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
3330+
/*
3331+
* Cache the extent and update transaction to commit on fdatasync only
3332+
* when it is _not_ an uninitialized extent.
3333+
*/
3334+
if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
33283335
ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
33293336
EXT4_EXT_CACHE_EXTENT);
3337+
ext4_update_inode_fsync_trans(handle, inode, 1);
3338+
} else
3339+
ext4_update_inode_fsync_trans(handle, inode, 0);
33303340
out:
33313341
if (allocated > max_blocks)
33323342
allocated = max_blocks;

fs/ext4/fsync.c

+17-29
Original file line numberDiff line numberDiff line change
@@ -51,25 +51,30 @@
5151
int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
5252
{
5353
struct inode *inode = dentry->d_inode;
54+
struct ext4_inode_info *ei = EXT4_I(inode);
5455
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
55-
int err, ret = 0;
56+
int ret;
57+
tid_t commit_tid;
5658

5759
J_ASSERT(ext4_journal_current_handle() == NULL);
5860

5961
trace_ext4_sync_file(file, dentry, datasync);
6062

63+
if (inode->i_sb->s_flags & MS_RDONLY)
64+
return 0;
65+
6166
ret = flush_aio_dio_completed_IO(inode);
6267
if (ret < 0)
6368
return ret;
69+
70+
if (!journal)
71+
return simple_fsync(file, dentry, datasync);
72+
6473
/*
65-
* data=writeback:
74+
* data=writeback,ordered:
6675
* The caller's filemap_fdatawrite()/wait will sync the data.
67-
* sync_inode() will sync the metadata
68-
*
69-
* data=ordered:
70-
* The caller's filemap_fdatawrite() will write the data and
71-
* sync_inode() will write the inode if it is dirty. Then the caller's
72-
* filemap_fdatawait() will wait on the pages.
76+
* Metadata is in the journal, we wait for proper transaction to
77+
* commit here.
7378
*
7479
* data=journal:
7580
* filemap_fdatawrite won't do anything (the buffers are clean).
@@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
8287
if (ext4_should_journal_data(inode))
8388
return ext4_force_commit(inode->i_sb);
8489

85-
if (!journal)
86-
ret = sync_mapping_buffers(inode->i_mapping);
87-
88-
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
89-
goto out;
90-
91-
/*
92-
* The VFS has written the file data. If the inode is unaltered
93-
* then we need not start a commit.
94-
*/
95-
if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
96-
struct writeback_control wbc = {
97-
.sync_mode = WB_SYNC_ALL,
98-
.nr_to_write = 0, /* sys_fsync did this */
99-
};
100-
err = sync_inode(inode, &wbc);
101-
if (ret == 0)
102-
ret = err;
103-
}
104-
out:
105-
if (journal && (journal->j_flags & JBD2_BARRIER))
90+
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
91+
if (jbd2_log_start_commit(journal, commit_tid))
92+
jbd2_log_wait_commit(journal, commit_tid);
93+
else if (journal->j_flags & JBD2_BARRIER)
10694
blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
10795
return ret;
10896
}

fs/ext4/inode.c

+29
Original file line numberDiff line numberDiff line change
@@ -983,6 +983,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
983983
goto cleanup;
984984

985985
set_buffer_new(bh_result);
986+
987+
ext4_update_inode_fsync_trans(handle, inode, 1);
986988
got_it:
987989
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
988990
if (count > blocks_to_boundary)
@@ -4738,6 +4740,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
47384740
struct ext4_inode *raw_inode;
47394741
struct ext4_inode_info *ei;
47404742
struct inode *inode;
4743+
journal_t *journal = EXT4_SB(sb)->s_journal;
47414744
long ret;
47424745
int block;
47434746

@@ -4802,6 +4805,31 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
48024805
ei->i_data[block] = raw_inode->i_block[block];
48034806
INIT_LIST_HEAD(&ei->i_orphan);
48044807

4808+
/*
4809+
* Set transaction id's of transactions that have to be committed
4810+
* to finish f[data]sync. We set them to currently running transaction
4811+
* as we cannot be sure that the inode or some of its metadata isn't
4812+
* part of the transaction - the inode could have been reclaimed and
4813+
* now it is reread from disk.
4814+
*/
4815+
if (journal) {
4816+
transaction_t *transaction;
4817+
tid_t tid;
4818+
4819+
spin_lock(&journal->j_state_lock);
4820+
if (journal->j_running_transaction)
4821+
transaction = journal->j_running_transaction;
4822+
else
4823+
transaction = journal->j_committing_transaction;
4824+
if (transaction)
4825+
tid = transaction->t_tid;
4826+
else
4827+
tid = journal->j_commit_sequence;
4828+
spin_unlock(&journal->j_state_lock);
4829+
ei->i_sync_tid = tid;
4830+
ei->i_datasync_tid = tid;
4831+
}
4832+
48054833
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
48064834
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
48074835
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
@@ -5056,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle,
50565084
err = rc;
50575085
ei->i_state &= ~EXT4_STATE_NEW;
50585086

5087+
ext4_update_inode_fsync_trans(handle, inode, 0);
50595088
out_brelse:
50605089
brelse(bh);
50615090
ext4_std_error(inode->i_sb, err);

fs/ext4/super.c

+2
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
706706
spin_lock_init(&(ei->i_block_reservation_lock));
707707
INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
708708
ei->cur_aio_dio = NULL;
709+
ei->i_sync_tid = 0;
710+
ei->i_datasync_tid = 0;
709711

710712
return &ei->vfs_inode;
711713
}

0 commit comments

Comments
 (0)