Skip to content

Commit b4be6ae

Browse files
josefbacik authored and kdave committed
btrfs: do not start relocation until in progress drops are done
We hit a bug with a recovering relocation on mount for one of our file systems in production. I reproduced this locally by injecting errors into snapshot delete with balance running at the same time. This presented as an error while looking up an extent item:

  WARNING: CPU: 5 PID: 1501 at fs/btrfs/extent-tree.c:866 lookup_inline_extent_backref+0x647/0x680
  CPU: 5 PID: 1501 Comm: btrfs-balance Not tainted 5.16.0-rc8+ #8
  RIP: 0010:lookup_inline_extent_backref+0x647/0x680
  RSP: 0018:ffffae0a023ab960 EFLAGS: 00010202
  RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000
  RDX: 0000000000000000 RSI: 000000000000000c RDI: 0000000000000000
  RBP: ffff943fd2a39b60 R08: 0000000000000000 R09: 0000000000000001
  R10: 0001434088152de0 R11: 0000000000000000 R12: 0000000001d05000
  R13: ffff943fd2a39b60 R14: ffff943fdb96f2a0 R15: ffff9442fc923000
  FS: 0000000000000000(0000) GS:ffff944e9eb40000(0000) knlGS:0000000000000000
  CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f1157b1fca8 CR3: 000000010f092000 CR4: 0000000000350ee0
  Call Trace:
   <TASK>
   insert_inline_extent_backref+0x46/0xd0
   __btrfs_inc_extent_ref.isra.0+0x5f/0x200
   ? btrfs_merge_delayed_refs+0x164/0x190
   __btrfs_run_delayed_refs+0x561/0xfa0
   ? btrfs_search_slot+0x7b4/0xb30
   ? btrfs_update_root+0x1a9/0x2c0
   btrfs_run_delayed_refs+0x73/0x1f0
   ? btrfs_update_root+0x1a9/0x2c0
   btrfs_commit_transaction+0x50/0xa50
   ? btrfs_update_reloc_root+0x122/0x220
   prepare_to_merge+0x29f/0x320
   relocate_block_group+0x2b8/0x550
   btrfs_relocate_block_group+0x1a6/0x350
   btrfs_relocate_chunk+0x27/0xe0
   btrfs_balance+0x777/0xe60
   balance_kthread+0x35/0x50
   ? btrfs_balance+0xe60/0xe60
   kthread+0x16b/0x190
   ? set_kthread_struct+0x40/0x40
   ret_from_fork+0x22/0x30
   </TASK>

Normally snapshot deletion and relocation are excluded from running at the same time by the fs_info->cleaner_mutex. However if we had a pending balance waiting to get the ->cleaner_mutex, and a snapshot deletion was running, and then the box crashed, we would come up in a state where we have a half deleted snapshot.

Again, in the normal case the snapshot deletion needs to complete before relocation can start, but in this case relocation could very well start before the snapshot deletion completes, as we simply add the root to the dead roots list and wait for the next time the cleaner runs to clean up the snapshot.

Fix this by setting a bit on the fs_info if we have any DEAD_ROOT's that had a pending drop_progress key. If they do then we know we were in the middle of the drop operation and set a flag on the fs_info. Then balance can wait until this flag is cleared to start up again.

If there are DEAD_ROOT's that don't have a drop_progress set then we're safe to start balance right away as we'll be properly protected by the cleaner_mutex.

CC: [email protected] # 5.10+
Reviewed-by: Filipe Manana <[email protected]>
Signed-off-by: Josef Bacik <[email protected]>
Reviewed-by: David Sterba <[email protected]>
Signed-off-by: David Sterba <[email protected]>
1 parent a6ab66e commit b4be6ae

File tree

7 files changed

+91
-1
lines changed

7 files changed

+91
-1
lines changed

fs/btrfs/ctree.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,9 @@ enum {
602602
/* Indicate that we want the transaction kthread to commit right now. */
603603
BTRFS_FS_COMMIT_TRANS,
604604

605+
/* Indicate we have half completed snapshot deletions pending. */
606+
BTRFS_FS_UNFINISHED_DROPS,
607+
605608
#if BITS_PER_LONG == 32
606609
/* Indicate if we have error/warn message printed on 32bit systems */
607610
BTRFS_FS_32BIT_ERROR,
@@ -1106,8 +1109,15 @@ enum {
11061109
BTRFS_ROOT_QGROUP_FLUSHING,
11071110
/* We started the orphan cleanup for this root. */
11081111
BTRFS_ROOT_ORPHAN_CLEANUP,
1112+
/* This root has a drop operation that was started previously. */
1113+
BTRFS_ROOT_UNFINISHED_DROP,
11091114
};
11101115

1116+
static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
1117+
{
1118+
clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
1119+
}
1120+
11111121
/*
11121122
* Record swapped tree blocks of a subvolume tree for delayed subtree trace
11131123
* code. For detail check comment in fs/btrfs/qgroup.c.

fs/btrfs/disk-io.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3813,6 +3813,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
38133813

38143814
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
38153815

3816+
/* Kick the cleaner thread so it'll start deleting snapshots. */
3817+
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
3818+
wake_up_process(fs_info->cleaner_kthread);
3819+
38163820
clear_oneshot:
38173821
btrfs_clear_oneshot_options(fs_info);
38183822
return 0;
@@ -4538,6 +4542,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
45384542
*/
45394543
kthread_park(fs_info->cleaner_kthread);
45404544

4545+
/*
4546+
* If we had UNFINISHED_DROPS we could still be processing them, so
4547+
* clear that bit and wake up relocation so it can stop.
4548+
*/
4549+
btrfs_wake_unfinished_drop(fs_info);
4550+
45414551
/* wait for the qgroup rescan worker to stop */
45424552
btrfs_qgroup_wait_for_completion(fs_info, false);
45434553

fs/btrfs/extent-tree.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5622,6 +5622,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
56225622
int ret;
56235623
int level;
56245624
bool root_dropped = false;
5625+
bool unfinished_drop = false;
56255626

56265627
btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
56275628

@@ -5664,6 +5665,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
56645665
* already dropped.
56655666
*/
56665667
set_bit(BTRFS_ROOT_DELETING, &root->state);
5668+
unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
5669+
56675670
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
56685671
level = btrfs_header_level(root->node);
56695672
path->nodes[level] = btrfs_lock_root_node(root);
@@ -5838,6 +5841,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
58385841
kfree(wc);
58395842
btrfs_free_path(path);
58405843
out:
5844+
/*
5845+
* We were an unfinished drop root, check to see if there are any
5846+
* pending, and if not clear and wake up any waiters.
5847+
*/
5848+
if (!err && unfinished_drop)
5849+
btrfs_maybe_wake_unfinished_drop(fs_info);
5850+
58415851
/*
58425852
* So if we need to stop dropping the snapshot for whatever reason we
58435853
* need to make sure to add it back to the dead root list so that we

fs/btrfs/relocation.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3960,6 +3960,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
39603960
int rw = 0;
39613961
int err = 0;
39623962

3963+
/*
3964+
* This only gets set if we had a half-deleted snapshot on mount. We
3965+
* cannot allow relocation to start while we're still trying to clean up
3966+
* these pending deletions.
3967+
*/
3968+
ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
3969+
if (ret)
3970+
return ret;
3971+
3972+
/* We may have been woken up by close_ctree, so bail if we're closing. */
3973+
if (btrfs_fs_closing(fs_info))
3974+
return -EINTR;
3975+
39633976
bg = btrfs_lookup_block_group(fs_info, group_start);
39643977
if (!bg)
39653978
return -ENOENT;

fs/btrfs/root-tree.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
278278

279279
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
280280
if (btrfs_root_refs(&root->root_item) == 0) {
281+
struct btrfs_key drop_key;
282+
283+
btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
284+
/*
285+
* If we have a non-zero drop_progress then we know we
286+
* made it partly through deleting this snapshot, and
287+
* thus we need to make sure we block any balance from
288+
* happening until this snapshot is completely dropped.
289+
*/
290+
if (drop_key.objectid != 0 || drop_key.type != 0 ||
291+
drop_key.offset != 0) {
292+
set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
293+
set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
294+
}
295+
281296
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
282297
btrfs_add_dead_root(root);
283298
}

fs/btrfs/transaction.c

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1319,6 +1319,32 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
13191319
return 0;
13201320
}
13211321

1322+
/*
1323+
* If we had a pending drop we need to see if there are any others left in our
1324+
* dead roots list, and if not clear our bit and wake any waiters.
1325+
*/
1326+
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
1327+
{
1328+
/*
1329+
* We put the drop in progress roots at the front of the list, so if the
1330+
* first entry doesn't have UNFINISHED_DROP set we can wake everybody
1331+
* up.
1332+
*/
1333+
spin_lock(&fs_info->trans_lock);
1334+
if (!list_empty(&fs_info->dead_roots)) {
1335+
struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
1336+
struct btrfs_root,
1337+
root_list);
1338+
if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
1339+
spin_unlock(&fs_info->trans_lock);
1340+
return;
1341+
}
1342+
}
1343+
spin_unlock(&fs_info->trans_lock);
1344+
1345+
btrfs_wake_unfinished_drop(fs_info);
1346+
}
1347+
13221348
/*
13231349
* dead roots are old snapshots that need to be deleted. This allocates
13241350
* a dirty root struct and adds it into the list of dead roots that need to
@@ -1331,7 +1357,12 @@ void btrfs_add_dead_root(struct btrfs_root *root)
13311357
spin_lock(&fs_info->trans_lock);
13321358
if (list_empty(&root->root_list)) {
13331359
btrfs_grab_root(root);
1334-
list_add_tail(&root->root_list, &fs_info->dead_roots);
1360+
1361+
/* We want to process the partially complete drops first. */
1362+
if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
1363+
list_add(&root->root_list, &fs_info->dead_roots);
1364+
else
1365+
list_add_tail(&root->root_list, &fs_info->dead_roots);
13351366
}
13361367
spin_unlock(&fs_info->trans_lock);
13371368
}

fs/btrfs/transaction.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
216216

217217
void btrfs_add_dead_root(struct btrfs_root *root);
218218
int btrfs_defrag_root(struct btrfs_root *root);
219+
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
219220
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
220221
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
221222
void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);

0 commit comments

Comments
 (0)