Skip to content

Commit 9bcfe28

Browse files
yonghong-song (kernel-patches-bot)
authored and committed
If a bucket contains a lot of sockets, during bpf_iter traversing
a bucket, concurrent userspace bpf_map_update_elem() and bpf program bpf_sk_storage_{get,delete}() may experience some undesirable delays as they will compete with bpf_iter for bucket lock. Note that the number of buckets for bpf_sk_storage_map is roughly the same as the number of cpus. So if there are lots of sockets in the system, each bucket could contain lots of sockets. Different actual use cases may experience different delays. Here, using selftest bpf_iter subtest bpf_sk_storage_map, I hacked the kernel with ktime_get_mono_fast_ns() to collect the time when a bucket was locked during bpf_iter prog traversing that bucket. This way, the maximum incurred delay was measured w.r.t. the number of elements in a bucket. # elems in each bucket delay(ns) 64 17000 256 72512 2048 875246 The potential delays will be further increased if we have even more elements in a bucket. Using rcu_read_lock() is a reasonable compromise here. It may lose some precision, e.g., access stale sockets, but it will not hurt performance of bpf program or user space application which also tries to get/delete or update map elements. Cc: Martin KaFai Lau <[email protected]> Acked-by: Song Liu <[email protected]> Signed-off-by: Yonghong Song <[email protected]> --- net/core/bpf_sk_storage.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) Changelog: v3 -> v4: - use rcu_dereference/hlist_next_rcu for hlist_entry_safe. (Martin) v2 -> v3: - fix a bug hlist_for_each_entry() => hlist_for_each_entry_rcu(). (Martin) - use rcu_dereference() instead of rcu_dereference_raw() for lockdep checking. (Martin) v1 -> v2: - added some performance numbers. (Song) - tried to silence some sparse complaints. but still has some left like context imbalance in "..." - different lock contexts for basic block which the code is too hard for sparse to analyze. (Jakub)
1 parent 6abaac8 commit 9bcfe28

File tree

1 file changed

+13
-18
lines changed

1 file changed

+13
-18
lines changed

net/core/bpf_sk_storage.c

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,7 @@ struct bpf_iter_seq_sk_storage_map_info {
678678
static struct bpf_local_storage_elem *
679679
bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
680680
struct bpf_local_storage_elem *prev_selem)
681+
__acquires(RCU) __releases(RCU)
681682
{
682683
struct bpf_local_storage *sk_storage;
683684
struct bpf_local_storage_elem *selem;
@@ -696,16 +697,16 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
696697
selem = prev_selem;
697698
count = 0;
698699
while (selem) {
699-
selem = hlist_entry_safe(selem->map_node.next,
700+
selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)),
700701
struct bpf_local_storage_elem, map_node);
701702
if (!selem) {
702703
/* not found, unlock and go to the next bucket */
703704
b = &smap->buckets[bucket_id++];
704-
raw_spin_unlock_bh(&b->lock);
705+
rcu_read_unlock();
705706
skip_elems = 0;
706707
break;
707708
}
708-
sk_storage = rcu_dereference_raw(selem->local_storage);
709+
sk_storage = rcu_dereference(selem->local_storage);
709710
if (sk_storage) {
710711
info->skip_elems = skip_elems + count;
711712
return selem;
@@ -715,18 +716,18 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
715716

716717
for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
717718
b = &smap->buckets[i];
718-
raw_spin_lock_bh(&b->lock);
719+
rcu_read_lock();
719720
count = 0;
720-
hlist_for_each_entry(selem, &b->list, map_node) {
721-
sk_storage = rcu_dereference_raw(selem->local_storage);
721+
hlist_for_each_entry_rcu(selem, &b->list, map_node) {
722+
sk_storage = rcu_dereference(selem->local_storage);
722723
if (sk_storage && count >= skip_elems) {
723724
info->bucket_id = i;
724725
info->skip_elems = count;
725726
return selem;
726727
}
727728
count++;
728729
}
729-
raw_spin_unlock_bh(&b->lock);
730+
rcu_read_unlock();
730731
skip_elems = 0;
731732
}
732733

@@ -785,7 +786,7 @@ static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
785786
ctx.meta = &meta;
786787
ctx.map = info->map;
787788
if (selem) {
788-
sk_storage = rcu_dereference_raw(selem->local_storage);
789+
sk_storage = rcu_dereference(selem->local_storage);
789790
ctx.sk = sk_storage->owner;
790791
ctx.value = SDATA(selem)->data;
791792
}
@@ -801,18 +802,12 @@ static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v)
801802
}
802803

803804
static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v)
805+
__releases(RCU)
804806
{
805-
struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
806-
struct bpf_local_storage_map *smap;
807-
struct bpf_local_storage_map_bucket *b;
808-
809-
if (!v) {
807+
if (!v)
810808
(void)__bpf_sk_storage_map_seq_show(seq, v);
811-
} else {
812-
smap = (struct bpf_local_storage_map *)info->map;
813-
b = &smap->buckets[info->bucket_id];
814-
raw_spin_unlock_bh(&b->lock);
815-
}
809+
else
810+
rcu_read_unlock();
816811
}
817812

818813
static int bpf_iter_init_sk_storage_map(void *priv_data,

0 commit comments

Comments
 (0)