Skip to content

Commit 8faba50

Browse files
fdmananagregkh
authored andcommitted
btrfs: fix swap file activation failure due to extents that used to be shared
commit 03018e5 upstream. When activating a swap file, to determine if an extent is shared we use can_nocow_extent(), which ends up at btrfs_cross_ref_exist(). That helper is meant to be quick because it's used in the NOCOW write path, when flushing delalloc and when doing a direct IO write, however it does return some false positives, meaning it may indicate that an extent is shared even if it's no longer the case. For the write path this is fine, we just do a unnecessary COW operation instead of doing a more rigorous check which would be too heavy (calling btrfs_is_data_extent_shared()). However when activating a swap file, the false positives simply result in a failure, which is confusing for users/applications. One particular case where this happens is when a data extent only has 1 reference but that reference is not inlined in the extent item located in the extent tree - this happens when we create more than 33 references for an extent and then delete those 33 references plus every other non-inline reference except one. The function check_committed_ref() assumes that if the size of an extent item doesn't match the size of struct btrfs_extent_item plus the size of an inline reference (plus an owner reference in case simple quotas are enabled), then the extent is shared - that is not the case however, we can have a single reference but it's not inlined - the reason we do this is to be fast and avoid inspecting non-inline references which may be located in another leaf of the extent tree, slowing down write paths. The following test script reproduces the bug: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi NUM_CLONES=50 umount $DEV &> /dev/null run_test() { local sync_after_add_reflinks=$1 local sync_after_remove_reflinks=$2 mkfs.btrfs -f $DEV > /dev/null #mkfs.xfs -f $DEV > /dev/null mount $DEV $MNT touch $MNT/foo chmod 0600 $MNT/foo # On btrfs the file must be NOCOW. chattr +C $MNT/foo &> /dev/null xfs_io -s -c "pwrite -b 1M 0 1M" $MNT/foo mkswap $MNT/foo for ((i = 1; i <= $NUM_CLONES; i++)); do touch $MNT/foo_clone_$i chmod 0600 $MNT/foo_clone_$i # On btrfs the file must be NOCOW. chattr +C $MNT/foo_clone_$i &> /dev/null cp --reflink=always $MNT/foo $MNT/foo_clone_$i done if [ $sync_after_add_reflinks -ne 0 ]; then # Flush delayed refs and commit current transaction. sync -f $MNT fi # Remove the original file and all clones except the last. rm -f $MNT/foo for ((i = 1; i < $NUM_CLONES; i++)); do rm -f $MNT/foo_clone_$i done if [ $sync_after_remove_reflinks -ne 0 ]; then # Flush delayed refs and commit current transaction. sync -f $MNT fi # Now use the last clone as a swap file. It should work since # its extent are not shared anymore. swapon $MNT/foo_clone_${NUM_CLONES} swapoff $MNT/foo_clone_${NUM_CLONES} umount $MNT } echo -e "\nTest without sync after creating and removing clones" run_test 0 0 echo -e "\nTest with sync after creating clones" run_test 1 0 echo -e "\nTest with sync after removing clones" run_test 0 1 echo -e "\nTest with sync after creating and removing clones" run_test 1 1 Running the test: $ ./test.sh Test without sync after creating and removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0017 sec (556.793 MiB/sec and 556.7929 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=a6b9c29e-5ef4-4689-a8ac-bc199c750f02 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Test with sync after creating clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0036 sec (271.739 MiB/sec and 271.7391 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=5e9008d6-1f7a-4948-a1b4-3f30aba20a33 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Test with sync after removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0103 sec (96.665 MiB/sec and 96.6651 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=916c2740-fa9f-4385-9f06-29c3f89e4764 Test with sync after creating and removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0031 sec (314.268 MiB/sec and 314.2678 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=06aab1dd-4d90-49c0-bd9f-3a8db4e2f912 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Fix this by reworking btrfs_swap_activate() to instead of using extent maps and checking for shared extents with can_nocow_extent(), iterate over the inode's file extent items and use the accurate btrfs_is_data_extent_shared(). CC: [email protected] # 5.4+ Reviewed-by: Qu Wenruo <[email protected]> Signed-off-by: Filipe Manana <[email protected]> Signed-off-by: David Sterba <[email protected]> Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent 9f372e8 commit 8faba50

File tree

1 file changed

+69
-27
lines changed

1 file changed

+69
-27
lines changed

fs/btrfs/inode.c

Lines changed: 69 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -9782,15 +9782,16 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
97829782
struct btrfs_fs_info *fs_info = root->fs_info;
97839783
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
97849784
struct extent_state *cached_state = NULL;
9785-
struct extent_map *em = NULL;
97869785
struct btrfs_chunk_map *map = NULL;
97879786
struct btrfs_device *device = NULL;
97889787
struct btrfs_swap_info bsi = {
97899788
.lowest_ppage = (sector_t)-1ULL,
97909789
};
9790+
struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
9791+
struct btrfs_path *path = NULL;
97919792
int ret = 0;
97929793
u64 isize;
9793-
u64 start;
9794+
u64 prev_extent_end = 0;
97949795

97959796
/*
97969797
* Acquire the inode's mmap lock to prevent races with memory mapped
@@ -9829,6 +9830,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
98299830
goto out_unlock_mmap;
98309831
}
98319832

9833+
path = btrfs_alloc_path();
9834+
backref_ctx = btrfs_alloc_backref_share_check_ctx();
9835+
if (!path || !backref_ctx) {
9836+
ret = -ENOMEM;
9837+
goto out_unlock_mmap;
9838+
}
9839+
98329840
/*
98339841
* Balance or device remove/replace/resize can move stuff around from
98349842
* under us. The exclop protection makes sure they aren't running/won't
@@ -9887,24 +9895,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
98879895
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
98889896

98899897
lock_extent(io_tree, 0, isize - 1, &cached_state);
9890-
start = 0;
9891-
while (start < isize) {
9892-
u64 logical_block_start, physical_block_start;
9898+
while (prev_extent_end < isize) {
9899+
struct btrfs_key key;
9900+
struct extent_buffer *leaf;
9901+
struct btrfs_file_extent_item *ei;
98939902
struct btrfs_block_group *bg;
9894-
u64 len = isize - start;
9903+
u64 logical_block_start;
9904+
u64 physical_block_start;
9905+
u64 extent_gen;
9906+
u64 disk_bytenr;
9907+
u64 len;
98959908

9896-
em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
9897-
if (IS_ERR(em)) {
9898-
ret = PTR_ERR(em);
9909+
key.objectid = btrfs_ino(BTRFS_I(inode));
9910+
key.type = BTRFS_EXTENT_DATA_KEY;
9911+
key.offset = prev_extent_end;
9912+
9913+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9914+
if (ret < 0)
98999915
goto out;
9900-
}
99019916

9902-
if (em->disk_bytenr == EXTENT_MAP_HOLE) {
9917+
/*
9918+
* If key not found it means we have an implicit hole (NO_HOLES
9919+
* is enabled).
9920+
*/
9921+
if (ret > 0) {
99039922
btrfs_warn(fs_info, "swapfile must not have holes");
99049923
ret = -EINVAL;
99059924
goto out;
99069925
}
9907-
if (em->disk_bytenr == EXTENT_MAP_INLINE) {
9926+
9927+
leaf = path->nodes[0];
9928+
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
9929+
9930+
if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
99089931
/*
99099932
* It's unlikely we'll ever actually find ourselves
99109933
* here, as a file small enough to fit inline won't be
@@ -9916,23 +9939,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
99169939
ret = -EINVAL;
99179940
goto out;
99189941
}
9919-
if (extent_map_is_compressed(em)) {
9942+
9943+
if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
99209944
btrfs_warn(fs_info, "swapfile must not be compressed");
99219945
ret = -EINVAL;
99229946
goto out;
99239947
}
99249948

9925-
logical_block_start = extent_map_block_start(em) + (start - em->start);
9926-
len = min(len, em->len - (start - em->start));
9927-
free_extent_map(em);
9928-
em = NULL;
9949+
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
9950+
if (disk_bytenr == 0) {
9951+
btrfs_warn(fs_info, "swapfile must not have holes");
9952+
ret = -EINVAL;
9953+
goto out;
9954+
}
9955+
9956+
logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
9957+
extent_gen = btrfs_file_extent_generation(leaf, ei);
9958+
prev_extent_end = btrfs_file_extent_end(path);
9959+
9960+
if (prev_extent_end > isize)
9961+
len = isize - key.offset;
9962+
else
9963+
len = btrfs_file_extent_num_bytes(leaf, ei);
99299964

9930-
ret = can_nocow_extent(inode, start, &len, NULL, false, true);
9965+
backref_ctx->curr_leaf_bytenr = leaf->start;
9966+
9967+
/*
9968+
* Don't need the path anymore, release to avoid deadlocks when
9969+
* calling btrfs_is_data_extent_shared() because when joining a
9970+
* transaction it can block waiting for the current one's commit
9971+
* which in turn may be trying to lock the same leaf to flush
9972+
* delayed items for example.
9973+
*/
9974+
btrfs_release_path(path);
9975+
9976+
ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
9977+
extent_gen, backref_ctx);
99319978
if (ret < 0) {
99329979
goto out;
9933-
} else if (ret) {
9934-
ret = 0;
9935-
} else {
9980+
} else if (ret > 0) {
99369981
btrfs_warn(fs_info,
99379982
"swapfile must not be copy-on-write");
99389983
ret = -EINVAL;
@@ -9967,7 +10012,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
996710012

996810013
physical_block_start = (map->stripes[0].physical +
996910014
(logical_block_start - map->start));
9970-
len = min(len, map->chunk_len - (logical_block_start - map->start));
997110015
btrfs_free_chunk_map(map);
997210016
map = NULL;
997310017

@@ -10008,20 +10052,16 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
1000810052
if (ret)
1000910053
goto out;
1001010054
}
10011-
bsi.start = start;
10055+
bsi.start = key.offset;
1001210056
bsi.block_start = physical_block_start;
1001310057
bsi.block_len = len;
1001410058
}
10015-
10016-
start += len;
1001710059
}
1001810060

1001910061
if (bsi.block_len)
1002010062
ret = btrfs_add_swap_extent(sis, &bsi);
1002110063

1002210064
out:
10023-
if (!IS_ERR_OR_NULL(em))
10024-
free_extent_map(em);
1002510065
if (!IS_ERR_OR_NULL(map))
1002610066
btrfs_free_chunk_map(map);
1002710067

@@ -10036,6 +10076,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
1003610076

1003710077
out_unlock_mmap:
1003810078
up_write(&BTRFS_I(inode)->i_mmap_lock);
10079+
btrfs_free_backref_share_ctx(backref_ctx);
10080+
btrfs_free_path(path);
1003910081
if (ret)
1004010082
return ret;
1004110083

0 commit comments

Comments
 (0)