
Commit 1bbc621

Btrfs: allow block group cache writeout outside critical section in commit
We loop through all of the dirty block groups during commit and write the free space cache. In order to make sure the cache is correct, we do this while no other writers are allowed in the commit.

If a large number of block groups are dirty, this can introduce long stalls during the final stages of the commit, which can block new procs trying to change the filesystem.

This commit changes the block group cache writeout to take appropriate locks and allow it to run earlier in the commit. We'll still have to redo some of the block groups, but it means we can get most of the work out of the way without blocking the entire FS.

Signed-off-by: Chris Mason <[email protected]>
1 parent 2b10826 commit 1bbc621

9 files changed, +341 -37 lines changed
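Before the diffs, a sketch of the shape of the change: flush as many dirty block group caches as possible while other writers can still join the transaction, then redo only the groups that were re-dirtied once the commit's critical section begins. The following is a minimal userspace model of that two-phase pattern; every name in it is invented for illustration, none of it is btrfs code.

#include <stdio.h>
#include <pthread.h>

#define NGROUPS 8

static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;
static int dirty[NGROUPS];		/* which "block groups" are dirty */

static void write_one(int i)		/* stands in for the cache IO */
{
	printf("writing group %d\n", i);
}

/* phase 1: before the critical section; writers may still re-dirty */
static void start_dirty_groups(void)
{
	for (int i = 0; i < NGROUPS; i++) {
		pthread_mutex_lock(&dirty_lock);
		int need = dirty[i];
		dirty[i] = 0;		/* claim it before dropping the lock */
		pthread_mutex_unlock(&dirty_lock);
		if (need)
			write_one(i);	/* slow IO runs without the lock */
	}
}

/* phase 2: inside the critical section; only re-dirtied groups remain */
static void finish_dirty_groups(void)
{
	pthread_mutex_lock(&dirty_lock);
	for (int i = 0; i < NGROUPS; i++) {
		if (dirty[i]) {
			write_one(i);
			dirty[i] = 0;
		}
	}
	pthread_mutex_unlock(&dirty_lock);
}

int main(void)
{
	for (int i = 0; i < NGROUPS; i++)
		dirty[i] = 1;
	start_dirty_groups();	/* bulk of the IO, FS still writable */
	dirty[3] = 1;		/* one group re-dirtied mid-commit */
	finish_dirty_groups();	/* small redo under full exclusion */
	return 0;
}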

fs/btrfs/ctree.h

Lines changed: 8 additions & 0 deletions
@@ -1491,6 +1491,12 @@ struct btrfs_fs_info {
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
 
+	/*
+	 * this is taken to make sure we don't set block groups ro after
+	 * the free space cache has been allocated on them
+	 */
+	struct mutex ro_block_group_mutex;
+
 	/* this is used during read/modify/write to make sure
 	 * no two ios are trying to mod the same stripe at the same
 	 * time
@@ -3407,6 +3413,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 owner, u64 offset, int no_quota);
 
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,

fs/btrfs/disk-io.c

Lines changed: 1 addition & 0 deletions
@@ -2572,6 +2572,7 @@ int open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
+	mutex_init(&fs_info->ro_block_group_mutex);
 	init_rwsem(&fs_info->commit_root_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);

fs/btrfs/extent-tree.c

Lines changed: 216 additions & 25 deletions
@@ -3298,7 +3298,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 		if (ret)
 			goto out_put;
 
-		ret = btrfs_truncate_free_space_cache(root, trans, inode);
+		ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
 		if (ret)
 			goto out_put;
 	}
@@ -3382,20 +3382,156 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+/*
+ * transaction commit does final block group cache writeback during a
+ * critical section where nothing is allowed to change the FS. This is
+ * required in order for the cache to actually match the block group,
+ * but can introduce a lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group
+ * cache IO. There's a chance we'll have to redo some of it if the
+ * block group changes again during the commit, but it greatly reduces
+ * the commit latency by getting rid of the easy block groups while
+ * we're still allowing others to join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	int ret = 0;
 	int should_put;
-	struct btrfs_path *path;
-	LIST_HEAD(io);
+	struct btrfs_path *path = NULL;
+	LIST_HEAD(dirty);
+	struct list_head *io = &cur_trans->io_bgs;
 	int num_started = 0;
-	int num_waited = 0;
+	int loops = 0;
+
+	spin_lock(&cur_trans->dirty_bgs_lock);
+	if (!list_empty(&cur_trans->dirty_bgs)) {
+		list_splice_init(&cur_trans->dirty_bgs, &dirty);
+	}
+	spin_unlock(&cur_trans->dirty_bgs_lock);
 
-	if (list_empty(&cur_trans->dirty_bgs))
+again:
+	if (list_empty(&dirty)) {
+		btrfs_free_path(path);
 		return 0;
+	}
+
+	/*
+	 * make sure all the block groups on our dirty list actually
+	 * exist
+	 */
+	btrfs_create_pending_block_groups(trans, root);
+
+	if (!path) {
+		path = btrfs_alloc_path();
+		if (!path)
+			return -ENOMEM;
+	}
+
+	while (!list_empty(&dirty)) {
+		cache = list_first_entry(&dirty,
+					 struct btrfs_block_group_cache,
+					 dirty_list);
+
+		/*
+		 * cache_write_mutex is here only to save us from balance
+		 * deleting this block group while we are writing out the
+		 * cache
+		 */
+		mutex_lock(&trans->transaction->cache_write_mutex);
+
+		/*
+		 * this can happen if something re-dirties a block
+		 * group that is already under IO. Just wait for it to
+		 * finish and then do it all again
+		 */
+		if (!list_empty(&cache->io_list)) {
+			list_del_init(&cache->io_list);
+			btrfs_wait_cache_io(root, trans, cache,
+					    &cache->io_ctl, path,
+					    cache->key.objectid);
+			btrfs_put_block_group(cache);
+		}
+
+
+		/*
+		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
+		 * if it should update the cache_state. Don't delete
+		 * until after we wait.
+		 *
+		 * Since we're not running in the commit critical section
+		 * we need the dirty_bgs_lock to protect from update_block_group
+		 */
+		spin_lock(&cur_trans->dirty_bgs_lock);
+		list_del_init(&cache->dirty_list);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+
+		should_put = 1;
+
+		cache_save_setup(cache, trans, path);
+
+		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+			cache->io_ctl.inode = NULL;
+			ret = btrfs_write_out_cache(root, trans, cache, path);
+			if (ret == 0 && cache->io_ctl.inode) {
+				num_started++;
+				should_put = 0;
+
+				/*
+				 * the cache_write_mutex is protecting
+				 * the io_list
+				 */
+				list_add_tail(&cache->io_list, io);
+			} else {
+				/*
+				 * if we failed to write the cache, the
+				 * generation will be bad and life goes on
+				 */
+				ret = 0;
+			}
+		}
+		if (!ret)
+			ret = write_one_cache_group(trans, root, path, cache);
+		mutex_unlock(&trans->transaction->cache_write_mutex);
+
+		/* if its not on the io list, we need to put the block group */
+		if (should_put)
+			btrfs_put_block_group(cache);
+
+		if (ret)
+			break;
+	}
+
+	/*
+	 * go through delayed refs for all the stuff we've just kicked off
+	 * and then loop back (just once)
+	 */
+	ret = btrfs_run_delayed_refs(trans, root, 0);
+	if (!ret && loops == 0) {
+		loops++;
+		spin_lock(&cur_trans->dirty_bgs_lock);
+		list_splice_init(&cur_trans->dirty_bgs, &dirty);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+		goto again;
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	int ret = 0;
+	int should_put;
+	struct btrfs_path *path;
+	struct list_head *io = &cur_trans->io_bgs;
+	int num_started = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
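The recurring idiom in btrfs_start_dirty_block_groups above is splice-and-drain: steal the entire shared dirty list in O(1) under dirty_bgs_lock (list_splice_init), then perform the slow writeout against the private copy with the spinlock dropped, re-taking it only for list surgery. A compact userspace model of that shape, with invented names and a plain linked list rather than the kernel's list.h:

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct node {
	struct node *next;
	int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_dirty;	/* producers push here */

static void mark_dirty(int id)
{
	struct node *n = malloc(sizeof(*n));

	n->id = id;
	pthread_mutex_lock(&list_lock);
	n->next = shared_dirty;		/* publish under the lock */
	shared_dirty = n;
	pthread_mutex_unlock(&list_lock);
}

static void drain_dirty(void)
{
	pthread_mutex_lock(&list_lock);
	struct node *mine = shared_dirty;	/* "splice": steal the whole list */
	shared_dirty = NULL;
	pthread_mutex_unlock(&list_lock);

	while (mine) {			/* slow path, lock not held */
		struct node *n = mine;

		mine = n->next;
		printf("writeback for %d\n", n->id);
		free(n);
	}
}

int main(void)
{
	mark_dirty(1);
	mark_dirty(2);
	drain_dirty();
	return 0;
}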
@@ -3423,14 +3559,16 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 					    &cache->io_ctl, path,
 					    cache->key.objectid);
 			btrfs_put_block_group(cache);
-			num_waited++;
 		}
 
+		/*
+		 * don't remove from the dirty list until after we've waited
+		 * on any pending IO
+		 */
 		list_del_init(&cache->dirty_list);
 		should_put = 1;
 
-		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-			cache_save_setup(cache, trans, path);
+		cache_save_setup(cache, trans, path);
 
 		if (!ret)
 			ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
@@ -3441,7 +3579,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			if (ret == 0 && cache->io_ctl.inode) {
 				num_started++;
 				should_put = 0;
-				list_add_tail(&cache->io_list, &io);
+				list_add_tail(&cache->io_list, io);
 			} else {
 				/*
 				 * if we failed to write the cache, the
@@ -3458,11 +3596,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			btrfs_put_block_group(cache);
 	}
 
-	while (!list_empty(&io)) {
-		cache = list_first_entry(&io, struct btrfs_block_group_cache,
+	while (!list_empty(io)) {
+		cache = list_first_entry(io, struct btrfs_block_group_cache,
 					 io_list);
 		list_del_init(&cache->io_list);
-		num_waited++;
 		btrfs_wait_cache_io(root, trans, cache,
 				    &cache->io_ctl, path, cache->key.objectid);
 		btrfs_put_block_group(cache);
@@ -5459,15 +5596,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	if (!alloc && cache->cached == BTRFS_CACHE_NO)
 		cache_block_group(cache, 1);
 
-	spin_lock(&trans->transaction->dirty_bgs_lock);
-	if (list_empty(&cache->dirty_list)) {
-		list_add_tail(&cache->dirty_list,
-			      &trans->transaction->dirty_bgs);
-		trans->transaction->num_dirty_bgs++;
-		btrfs_get_block_group(cache);
-	}
-	spin_unlock(&trans->transaction->dirty_bgs_lock);
-
 	byte_in_group = bytenr - cache->key.objectid;
 	WARN_ON(byte_in_group > cache->key.offset);
 
@@ -5516,6 +5644,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			spin_unlock(&info->unused_bgs_lock);
 		}
 	}
+
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	if (list_empty(&cache->dirty_list)) {
+		list_add_tail(&cache->dirty_list,
+			      &trans->transaction->dirty_bgs);
+		trans->transaction->num_dirty_bgs++;
+		btrfs_get_block_group(cache);
+	}
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
+
 	btrfs_put_block_group(cache);
 	total -= num_bytes;
 	bytenr += num_bytes;
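These two hunks move the dirty-list insertion in update_block_group from before the accounting to after it. Presumably the point is ordering: once cache writeout can run concurrently with updates, a block group should be (re)queued only after its in-memory counters are current, so a writer that already claimed the group off the list sees it re-appear and writes it again. A toy, single-threaded model of that publish-after-update ordering (names invented; the real code guards the counters with the group's own lock):

#include <stdio.h>
#include <pthread.h>

struct group {
	long used;	/* in-memory accounting for the group */
	int queued;	/* currently on the shared dirty list? */
};

static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;

/* writeout side: claim the group off the list, then write it out */
static void write_group(struct group *g)
{
	pthread_mutex_lock(&dirty_lock);
	g->queued = 0;		/* claimed; later updates must re-queue */
	pthread_mutex_unlock(&dirty_lock);
	printf("wrote used=%ld\n", g->used);
}

/* update side: change the group first, publish the re-dirty last */
static void account(struct group *g, long bytes)
{
	g->used += bytes;	/* 1. update the counters */

	pthread_mutex_lock(&dirty_lock);
	if (!g->queued)		/* 2. then (re)queue the group */
		g->queued = 1;
	pthread_mutex_unlock(&dirty_lock);
}

int main(void)
{
	struct group g = { .used = 0, .queued = 1 };

	write_group(&g);	/* early writeout claims the group */
	account(&g, 4096);	/* a later update re-queues it */
	if (g.queued)
		write_group(&g);	/* redone with current counters */
	return 0;
}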
@@ -8602,10 +8740,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
 	BUG_ON(cache->ro);
 
+again:
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
+	/*
+	 * we're not allowed to set block groups readonly after the dirty
+	 * block groups cache has started writing. If it already started,
+	 * back off and let this transaction commit
+	 */
+	mutex_lock(&root->fs_info->ro_block_group_mutex);
+	if (trans->transaction->dirty_bg_run) {
+		u64 transid = trans->transid;
+
+		mutex_unlock(&root->fs_info->ro_block_group_mutex);
+		btrfs_end_transaction(trans, root);
+
+		ret = btrfs_wait_for_commit(root, transid);
+		if (ret)
+			return ret;
+		goto again;
+	}
+
+
 	ret = set_block_group_ro(cache, 0);
 	if (!ret)
 		goto out;
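The back-off above is check-under-mutex, then wait-and-retry: if the running transaction has already started dirty block group writeout (dirty_bg_run), flipping a group read-only now could invalidate caches already being written, so the caller drops the guard, waits for that commit, and tries again against the next transaction. A condition-variable rendering of the same shape, using userspace stand-ins rather than btrfs APIs (the kernel version waits via btrfs_wait_for_commit instead):

#include <stdbool.h>
#include <pthread.h>

static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t commit_done = PTHREAD_COND_INITIALIZER;
static bool dirty_bg_run;	/* writeout started for this transaction */
static unsigned long transid = 1;

/* like btrfs_set_block_group_ro: no ro flips once writeout started */
static void set_group_readonly(void)
{
	pthread_mutex_lock(&guard);
	while (dirty_bg_run) {
		unsigned long waiting_for = transid;

		/* back off until that transaction commits, then re-check
		 * (cond_wait drops and re-takes the guard for us) */
		while (transid == waiting_for)
			pthread_cond_wait(&commit_done, &guard);
	}
	/* safe: writeout for the current transaction has not started */
	pthread_mutex_unlock(&guard);
}

/* commit side: mark writeout started, finish, open a new transaction */
static void commit_transaction(void)
{
	pthread_mutex_lock(&guard);
	dirty_bg_run = true;	/* from here on, ro flips must wait */
	/* ... write the dirty block group caches, commit ... */
	dirty_bg_run = false;
	transid++;
	pthread_cond_broadcast(&commit_done);
	pthread_mutex_unlock(&guard);
}

int main(void)
{
	commit_transaction();
	set_group_readonly();
	return 0;
}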
@@ -8620,6 +8778,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 		alloc_flags = update_block_group_flags(root, cache->flags);
 		check_system_chunk(trans, root, alloc_flags);
 	}
+	mutex_unlock(&root->fs_info->ro_block_group_mutex);
 
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -9425,7 +9584,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
+	/*
+	 * get the inode first so any iput calls done for the io_list
+	 * aren't the final iput (no unlinks allowed now)
+	 */
 	inode = lookup_free_space_inode(tree_root, block_group, path);
+
+	mutex_lock(&trans->transaction->cache_write_mutex);
+	/*
+	 * make sure our free spache cache IO is done before remove the
+	 * free space inode
+	 */
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	if (!list_empty(&block_group->io_list)) {
+		list_del_init(&block_group->io_list);
+
+		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
+		btrfs_wait_cache_io(root, trans, block_group,
+				    &block_group->io_ctl, path,
+				    block_group->key.objectid);
+		btrfs_put_block_group(block_group);
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+	}
+
+	if (!list_empty(&block_group->dirty_list)) {
+		list_del_init(&block_group->dirty_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
+	mutex_unlock(&trans->transaction->cache_write_mutex);
+
 	if (!IS_ERR(inode)) {
 		ret = btrfs_orphan_add(trans, inode);
 		if (ret) {
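btrfs_remove_block_group now has to reckon with cache IO still in flight from the early writeout: it checks the io_list under dirty_bgs_lock, and because waiting for that IO can sleep, it drops the spinlock across btrfs_wait_cache_io and re-takes it afterwards. A minimal sketch of that drop-lock-across-blocking-wait pattern, with invented names and a plain mutex standing in for the spinlock:

#include <stdbool.h>
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t bg_lock = PTHREAD_MUTEX_INITIALIZER;
static bool io_in_flight = true;	/* models !list_empty(&bg->io_list) */
static bool on_dirty_list = true;

static void wait_for_io(void)		/* may block; no lock held here */
{
	printf("waiting for cache IO\n");
}

static void teardown_group(void)
{
	pthread_mutex_lock(&bg_lock);
	if (io_in_flight) {
		io_in_flight = false;	/* unhook the entry while locked */
		pthread_mutex_unlock(&bg_lock);
		wait_for_io();		/* blocking wait, lock dropped */
		pthread_mutex_lock(&bg_lock);
	}
	if (on_dirty_list)		/* drop any leftover dirty entry */
		on_dirty_list = false;
	pthread_mutex_unlock(&bg_lock);
}

int main(void)
{
	teardown_group();
	return 0;
}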
@@ -9518,11 +9708,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	spin_lock(&trans->transaction->dirty_bgs_lock);
 	if (!list_empty(&block_group->dirty_list)) {
-		list_del_init(&block_group->dirty_list);
-		btrfs_put_block_group(block_group);
+		WARN_ON(1);
+	}
+	if (!list_empty(&block_group->io_list)) {
+		WARN_ON(1);
 	}
 	spin_unlock(&trans->transaction->dirty_bgs_lock);
-
 	btrfs_remove_free_space_cache(block_group);
 
 	spin_lock(&block_group->space_info->lock);
