Skip to content

Commit 0421682

Browse files
fdmananamasoncl
authored andcommitted
Btrfs: fix race between fs trimming and block group remove/allocation
Our fs trim operation, which is completely transactionless (doesn't start or joins an existing transaction) consists of visiting all block groups and then for each one to iterate its free space entries and perform a discard operation against the space range represented by the free space entries. However before performing a discard, the corresponding free space entry is removed from the free space rbtree, and when the discard completes it is added back to the free space rbtree. If a block group remove operation happens while the discard is ongoing (or before it starts and after a free space entry is hidden), we end up not waiting for the discard to complete, remove the extent map that maps logical address to physical addresses and the corresponding chunk metadata from the the chunk and device trees. After that and before the discard completes, the current running transaction can finish and a new one start, allowing for new block groups that map to the same physical addresses to be allocated and written to. So fix this by keeping the extent map in memory until the discard completes so that the same physical addresses aren't reused before it completes. If the physical locations that are under a discard operation end up being used for a new metadata block group for example, and dirty metadata extents are written before the discard finishes (the VM might call writepages() of our btree inode's i_mapping for example, or an fsync log commit happens) we end up overwriting metadata with zeroes, which leads to errors from fsck like the following: checking extents Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block owner ref check failed [833912832 16384] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block root 5 root dir 256 error root 5 inode 260 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 1 errors 6, no dir index, no inode ref root 5 inode 262 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 1 errors 6, no dir index, no inode ref root 5 inode 263 errors 2001, no inode item, link count wrong (...) Signed-off-by: Filipe Manana <[email protected]> Signed-off-by: Chris Mason <[email protected]>
1 parent ae0ab00 commit 0421682

File tree

6 files changed

+140
-21
lines changed

6 files changed

+140
-21
lines changed

fs/btrfs/ctree.h

+11-1
Original file line numberDiff line numberDiff line change
@@ -1279,6 +1279,7 @@ struct btrfs_block_group_cache {
12791279
unsigned int dirty:1;
12801280
unsigned int iref:1;
12811281
unsigned int has_caching_ctl:1;
1282+
unsigned int removed:1;
12821283

12831284
int disk_cache_state;
12841285

@@ -1311,6 +1312,8 @@ struct btrfs_block_group_cache {
13111312

13121313
/* For read-only block groups */
13131314
struct list_head ro_list;
1315+
1316+
atomic_t trimming;
13141317
};
13151318

13161319
/* delayed seq elem */
@@ -1740,6 +1743,12 @@ struct btrfs_fs_info {
17401743

17411744
/* For btrfs to record security options */
17421745
struct security_mnt_opts security_opts;
1746+
1747+
/*
1748+
* Chunks that can't be freed yet (under a trim/discard operation)
1749+
* and will be latter freed. Protected by fs_info->chunk_mutex.
1750+
*/
1751+
struct list_head pinned_chunks;
17431752
};
17441753

17451754
struct btrfs_subvolume_writers {
@@ -3405,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
34053414
u64 type, u64 chunk_objectid, u64 chunk_offset,
34063415
u64 size);
34073416
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
3408-
struct btrfs_root *root, u64 group_start);
3417+
struct btrfs_root *root, u64 group_start,
3418+
struct extent_map *em);
34093419
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
34103420
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
34113421
struct btrfs_root *root);

fs/btrfs/disk-io.c

+13
Original file line numberDiff line numberDiff line change
@@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb,
23842384
init_waitqueue_head(&fs_info->transaction_blocked_wait);
23852385
init_waitqueue_head(&fs_info->async_submit_wait);
23862386

2387+
INIT_LIST_HEAD(&fs_info->pinned_chunks);
2388+
23872389
ret = btrfs_alloc_stripe_hash_table(fs_info);
23882390
if (ret) {
23892391
err = ret;
@@ -3715,6 +3717,17 @@ void close_ctree(struct btrfs_root *root)
37153717

37163718
btrfs_free_block_rsv(root, root->orphan_block_rsv);
37173719
root->orphan_block_rsv = NULL;
3720+
3721+
lock_chunks(root);
3722+
while (!list_empty(&fs_info->pinned_chunks)) {
3723+
struct extent_map *em;
3724+
3725+
em = list_first_entry(&fs_info->pinned_chunks,
3726+
struct extent_map, list);
3727+
list_del_init(&em->list);
3728+
free_extent_map(em);
3729+
}
3730+
unlock_chunks(root);
37183731
}
37193732

37203733
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,

fs/btrfs/extent-tree.c

+59-1
Original file line numberDiff line numberDiff line change
@@ -9005,6 +9005,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
90059005
INIT_LIST_HEAD(&cache->bg_list);
90069006
INIT_LIST_HEAD(&cache->ro_list);
90079007
btrfs_init_free_space_ctl(cache);
9008+
atomic_set(&cache->trimming, 0);
90089009

90099010
return cache;
90109011
}
@@ -9306,7 +9307,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
93069307
}
93079308

93089309
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9309-
struct btrfs_root *root, u64 group_start)
9310+
struct btrfs_root *root, u64 group_start,
9311+
struct extent_map *em)
93109312
{
93119313
struct btrfs_path *path;
93129314
struct btrfs_block_group_cache *block_group;
@@ -9319,6 +9321,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
93199321
int index;
93209322
int factor;
93219323
struct btrfs_caching_control *caching_ctl = NULL;
9324+
bool remove_em;
93229325

93239326
root = root->fs_info->extent_root;
93249327

@@ -9464,6 +9467,61 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
94649467

94659468
memcpy(&key, &block_group->key, sizeof(key));
94669469

9470+
lock_chunks(root);
9471+
spin_lock(&block_group->lock);
9472+
block_group->removed = 1;
9473+
/*
9474+
* At this point trimming can't start on this block group, because we
9475+
* removed the block group from the tree fs_info->block_group_cache_tree
9476+
* so no one can't find it anymore and even if someone already got this
9477+
* block group before we removed it from the rbtree, they have already
9478+
* incremented block_group->trimming - if they didn't, they won't find
9479+
* any free space entries because we already removed them all when we
9480+
* called btrfs_remove_free_space_cache().
9481+
*
9482+
* And we must not remove the extent map from the fs_info->mapping_tree
9483+
* to prevent the same logical address range and physical device space
9484+
* ranges from being reused for a new block group. This is because our
9485+
* fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
9486+
* completely transactionless, so while it is trimming a range the
9487+
* currently running transaction might finish and a new one start,
9488+
* allowing for new block groups to be created that can reuse the same
9489+
* physical device locations unless we take this special care.
9490+
*/
9491+
remove_em = (atomic_read(&block_group->trimming) == 0);
9492+
/*
9493+
* Make sure a trimmer task always sees the em in the pinned_chunks list
9494+
* if it sees block_group->removed == 1 (needs to lock block_group->lock
9495+
* before checking block_group->removed).
9496+
*/
9497+
if (!remove_em) {
9498+
/*
9499+
* Our em might be in trans->transaction->pending_chunks which
9500+
* is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
9501+
* and so is the fs_info->pinned_chunks list.
9502+
*
9503+
* So at this point we must be holding the chunk_mutex to avoid
9504+
* any races with chunk allocation (more specifically at
9505+
* volumes.c:contains_pending_extent()), to ensure it always
9506+
* sees the em, either in the pending_chunks list or in the
9507+
* pinned_chunks list.
9508+
*/
9509+
list_move_tail(&em->list, &root->fs_info->pinned_chunks);
9510+
}
9511+
spin_unlock(&block_group->lock);
9512+
unlock_chunks(root);
9513+
9514+
if (remove_em) {
9515+
struct extent_map_tree *em_tree;
9516+
9517+
em_tree = &root->fs_info->mapping_tree.map_tree;
9518+
write_lock(&em_tree->lock);
9519+
remove_extent_mapping(em_tree, em);
9520+
write_unlock(&em_tree->lock);
9521+
/* once for the tree */
9522+
free_extent_map(em);
9523+
}
9524+
94679525
btrfs_put_block_group(block_group);
94689526
btrfs_put_block_group(block_group);
94699527

fs/btrfs/free-space-cache.c

+37-1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "disk-io.h"
2828
#include "extent_io.h"
2929
#include "inode-map.h"
30+
#include "volumes.h"
3031

3132
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
3233
#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
@@ -3101,11 +3102,46 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
31013102

31023103
*trimmed = 0;
31033104

3105+
spin_lock(&block_group->lock);
3106+
if (block_group->removed) {
3107+
spin_unlock(&block_group->lock);
3108+
return 0;
3109+
}
3110+
atomic_inc(&block_group->trimming);
3111+
spin_unlock(&block_group->lock);
3112+
31043113
ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
31053114
if (ret)
3106-
return ret;
3115+
goto out;
31073116

31083117
ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3118+
out:
3119+
spin_lock(&block_group->lock);
3120+
if (atomic_dec_and_test(&block_group->trimming) &&
3121+
block_group->removed) {
3122+
struct extent_map_tree *em_tree;
3123+
struct extent_map *em;
3124+
3125+
spin_unlock(&block_group->lock);
3126+
3127+
em_tree = &block_group->fs_info->mapping_tree.map_tree;
3128+
write_lock(&em_tree->lock);
3129+
em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3130+
1);
3131+
BUG_ON(!em); /* logic error, can't happen */
3132+
remove_extent_mapping(em_tree, em);
3133+
write_unlock(&em_tree->lock);
3134+
3135+
lock_chunks(block_group->fs_info->chunk_root);
3136+
list_del_init(&em->list);
3137+
unlock_chunks(block_group->fs_info->chunk_root);
3138+
3139+
/* once for us and once for the tree */
3140+
free_extent_map(em);
3141+
free_extent_map(em);
3142+
} else {
3143+
spin_unlock(&block_group->lock);
3144+
}
31093145

31103146
return ret;
31113147
}

fs/btrfs/volumes.c

+8-18
Original file line numberDiff line numberDiff line change
@@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
5353
DEFINE_MUTEX(uuid_mutex);
5454
static LIST_HEAD(fs_uuids);
5555

56-
static void lock_chunks(struct btrfs_root *root)
57-
{
58-
mutex_lock(&root->fs_info->chunk_mutex);
59-
}
60-
61-
static void unlock_chunks(struct btrfs_root *root)
62-
{
63-
mutex_unlock(&root->fs_info->chunk_mutex);
64-
}
65-
6656
static struct btrfs_fs_devices *__alloc_fs_devices(void)
6757
{
6858
struct btrfs_fs_devices *fs_devs;
@@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
10681058
u64 *start, u64 len)
10691059
{
10701060
struct extent_map *em;
1061+
struct list_head *search_list = &trans->transaction->pending_chunks;
10711062
int ret = 0;
10721063

1073-
list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
1064+
again:
1065+
list_for_each_entry(em, search_list, list) {
10741066
struct map_lookup *map;
10751067
int i;
10761068

@@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
10871079
ret = 1;
10881080
}
10891081
}
1082+
if (search_list == &trans->transaction->pending_chunks) {
1083+
search_list = &trans->root->fs_info->pinned_chunks;
1084+
goto again;
1085+
}
10901086

10911087
return ret;
10921088
}
@@ -2653,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
26532649
}
26542650
}
26552651

2656-
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2652+
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
26572653
if (ret) {
26582654
btrfs_abort_transaction(trans, extent_root, ret);
26592655
goto out;
26602656
}
26612657

2662-
write_lock(&em_tree->lock);
2663-
remove_extent_mapping(em_tree, em);
2664-
write_unlock(&em_tree->lock);
2665-
2666-
/* once for the tree */
2667-
free_extent_map(em);
26682658
out:
26692659
/* once for us */
26702660
free_extent_map(em);

fs/btrfs/volumes.h

+12
Original file line numberDiff line numberDiff line change
@@ -515,4 +515,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
515515
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
516516
void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
517517
struct btrfs_transaction *transaction);
518+
519+
static inline void lock_chunks(struct btrfs_root *root)
520+
{
521+
mutex_lock(&root->fs_info->chunk_mutex);
522+
}
523+
524+
static inline void unlock_chunks(struct btrfs_root *root)
525+
{
526+
mutex_unlock(&root->fs_info->chunk_mutex);
527+
}
528+
529+
518530
#endif

0 commit comments

Comments
 (0)