Skip to content

Commit 35d2b7f

Browse files
jrfastab authored and borkmann committed
bpf, sockmap: Fix preempt_rt splat when using raw_spin_lock_t
Sockmap and sockhash maps are a collection of psocks that are objects representing a socket plus a set of metadata needed to manage the BPF programs associated with the socket. These maps use the stab->lock to protect from concurrent operations on the maps, e.g. trying to insert to objects into the array at the same time in the same slot. Additionally, a sockhash map has a bucket lock to protect iteration and insert/delete into the hash entry. Each psock has a psock->link which is a linked list of all the maps that a psock is attached to. This allows a psock (socket) to be included in multiple sockmap and sockhash maps. This linked list is protected the psock->link_lock. They _must_ be nested correctly to avoid deadlock: lock(stab->lock) : do BPF map operations and psock insert/delete lock(psock->link_lock) : add map to psock linked list of maps unlock(psock->link_lock) unlock(stab->lock) For non PREEMPT_RT kernels both raw_spin_lock_t and spin_lock_t are guaranteed to not sleep. But, with PREEMPT_RT kernels the spin_lock_t variants may sleep. In the current code we have many patterns like this: rcu_critical_section: raw_spin_lock(stab->lock) spin_lock(psock->link_lock) <- may sleep ouch spin_unlock(psock->link_lock) raw_spin_unlock(stab->lock) rcu_critical_section Nesting spin_lock() inside a raw_spin_lock() violates locking rules for PREEMPT_RT kernels. And additionally we do alloc(GFP_ATOMICS) inside the stab->lock, but those might sleep on PREEMPT_RT kernels. The result is splats like this: ./test_progs -t sockmap_basic [ 33.344330] bpf_testmod: loading out-of-tree module taints kernel. 
[ 33.441933] [ 33.442089] ============================= [ 33.442421] [ BUG: Invalid wait context ] [ 33.442763] 6.5.0-rc5-01731-gec0ded2e0282 #4958 Tainted: G O [ 33.443320] ----------------------------- [ 33.443624] test_progs/2073 is trying to lock: [ 33.443960] ffff888102a1c290 (&psock->link_lock){....}-{3:3}, at: sock_map_update_common+0x2c2/0x3d0 [ 33.444636] other info that might help us debug this: [ 33.444991] context-{5:5} [ 33.445183] 3 locks held by test_progs/2073: [ 33.445498] #0: ffff88811a208d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: sock_map_update_elem_sys+0xff/0x330 [ 33.446159] #1: ffffffff842539e0 (rcu_read_lock){....}-{1:3}, at: sock_map_update_elem_sys+0xf5/0x330 [ 33.446809] #2: ffff88810d687240 (&stab->lock){+...}-{2:2}, at: sock_map_update_common+0x177/0x3d0 [ 33.447445] stack backtrace: [ 33.447655] CPU: 10 PID To fix observe we can't readily remove the allocations (for that we would need to use/create something similar to bpf_map_alloc). So convert raw_spinlock_t to spinlock_t. We note that sock_map_update that would trigger the allocate and potential sleep is only allowed through sys_bpf ops and via sock_ops which precludes hw interrupts and low level atomic sections in RT preempt kernel. On non RT preempt kernel there are no changes here and spin lock sections and alloc(GFP_ATOMIC) are still not sleepable. Signed-off-by: John Fastabend <[email protected]> Signed-off-by: Daniel Borkmann <[email protected]> Link: https://lore.kernel.org/bpf/[email protected]
1 parent be4033d commit 35d2b7f

File tree

1 file changed

+18
-18
lines changed

1 file changed

+18
-18
lines changed

net/core/sock_map.c

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ struct bpf_stab {
1818
struct bpf_map map;
1919
struct sock **sks;
2020
struct sk_psock_progs progs;
21-
raw_spinlock_t lock;
21+
spinlock_t lock;
2222
};
2323

2424
#define SOCK_CREATE_FLAG_MASK \
@@ -44,7 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
4444
return ERR_PTR(-ENOMEM);
4545

4646
bpf_map_init_from_attr(&stab->map, attr);
47-
raw_spin_lock_init(&stab->lock);
47+
spin_lock_init(&stab->lock);
4848

4949
stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries *
5050
sizeof(struct sock *),
@@ -411,7 +411,7 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
411411
struct sock *sk;
412412
int err = 0;
413413

414-
raw_spin_lock_bh(&stab->lock);
414+
spin_lock_bh(&stab->lock);
415415
sk = *psk;
416416
if (!sk_test || sk_test == sk)
417417
sk = xchg(psk, NULL);
@@ -421,7 +421,7 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
421421
else
422422
err = -EINVAL;
423423

424-
raw_spin_unlock_bh(&stab->lock);
424+
spin_unlock_bh(&stab->lock);
425425
return err;
426426
}
427427

@@ -487,7 +487,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
487487
psock = sk_psock(sk);
488488
WARN_ON_ONCE(!psock);
489489

490-
raw_spin_lock_bh(&stab->lock);
490+
spin_lock_bh(&stab->lock);
491491
osk = stab->sks[idx];
492492
if (osk && flags == BPF_NOEXIST) {
493493
ret = -EEXIST;
@@ -501,10 +501,10 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
501501
stab->sks[idx] = sk;
502502
if (osk)
503503
sock_map_unref(osk, &stab->sks[idx]);
504-
raw_spin_unlock_bh(&stab->lock);
504+
spin_unlock_bh(&stab->lock);
505505
return 0;
506506
out_unlock:
507-
raw_spin_unlock_bh(&stab->lock);
507+
spin_unlock_bh(&stab->lock);
508508
if (psock)
509509
sk_psock_put(sk, psock);
510510
out_free:
@@ -835,7 +835,7 @@ struct bpf_shtab_elem {
835835

836836
struct bpf_shtab_bucket {
837837
struct hlist_head head;
838-
raw_spinlock_t lock;
838+
spinlock_t lock;
839839
};
840840

841841
struct bpf_shtab {
@@ -910,15 +910,15 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
910910
* is okay since it's going away only after RCU grace period.
911911
* However, we need to check whether it's still present.
912912
*/
913-
raw_spin_lock_bh(&bucket->lock);
913+
spin_lock_bh(&bucket->lock);
914914
elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash,
915915
elem->key, map->key_size);
916916
if (elem_probe && elem_probe == elem) {
917917
hlist_del_rcu(&elem->node);
918918
sock_map_unref(elem->sk, elem);
919919
sock_hash_free_elem(htab, elem);
920920
}
921-
raw_spin_unlock_bh(&bucket->lock);
921+
spin_unlock_bh(&bucket->lock);
922922
}
923923

924924
static long sock_hash_delete_elem(struct bpf_map *map, void *key)
@@ -932,15 +932,15 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key)
932932
hash = sock_hash_bucket_hash(key, key_size);
933933
bucket = sock_hash_select_bucket(htab, hash);
934934

935-
raw_spin_lock_bh(&bucket->lock);
935+
spin_lock_bh(&bucket->lock);
936936
elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
937937
if (elem) {
938938
hlist_del_rcu(&elem->node);
939939
sock_map_unref(elem->sk, elem);
940940
sock_hash_free_elem(htab, elem);
941941
ret = 0;
942942
}
943-
raw_spin_unlock_bh(&bucket->lock);
943+
spin_unlock_bh(&bucket->lock);
944944
return ret;
945945
}
946946

@@ -1000,7 +1000,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
10001000
hash = sock_hash_bucket_hash(key, key_size);
10011001
bucket = sock_hash_select_bucket(htab, hash);
10021002

1003-
raw_spin_lock_bh(&bucket->lock);
1003+
spin_lock_bh(&bucket->lock);
10041004
elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
10051005
if (elem && flags == BPF_NOEXIST) {
10061006
ret = -EEXIST;
@@ -1026,10 +1026,10 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
10261026
sock_map_unref(elem->sk, elem);
10271027
sock_hash_free_elem(htab, elem);
10281028
}
1029-
raw_spin_unlock_bh(&bucket->lock);
1029+
spin_unlock_bh(&bucket->lock);
10301030
return 0;
10311031
out_unlock:
1032-
raw_spin_unlock_bh(&bucket->lock);
1032+
spin_unlock_bh(&bucket->lock);
10331033
sk_psock_put(sk, psock);
10341034
out_free:
10351035
sk_psock_free_link(link);
@@ -1115,7 +1115,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
11151115

11161116
for (i = 0; i < htab->buckets_num; i++) {
11171117
INIT_HLIST_HEAD(&htab->buckets[i].head);
1118-
raw_spin_lock_init(&htab->buckets[i].lock);
1118+
spin_lock_init(&htab->buckets[i].lock);
11191119
}
11201120

11211121
return &htab->map;
@@ -1147,11 +1147,11 @@ static void sock_hash_free(struct bpf_map *map)
11471147
* exists, psock exists and holds a ref to socket. That
11481148
* lets us to grab a socket ref too.
11491149
*/
1150-
raw_spin_lock_bh(&bucket->lock);
1150+
spin_lock_bh(&bucket->lock);
11511151
hlist_for_each_entry(elem, &bucket->head, node)
11521152
sock_hold(elem->sk);
11531153
hlist_move_list(&bucket->head, &unlink_list);
1154-
raw_spin_unlock_bh(&bucket->lock);
1154+
spin_unlock_bh(&bucket->lock);
11551155

11561156
/* Process removed entries out of atomic context to
11571157
* block for socket lock before deleting the psock's

0 commit comments

Comments
 (0)