Skip to content

Commit 9804985

Browse files
q2ven authored and davem330 committed
udp: Introduce optional per-netns hash table.
The maximum hash table size is 64K due to the nature of the protocol. [0]
It's smaller than TCP, and fewer sockets can cause a performance drop.

On an EC2 c5.24xlarge instance (192 GiB memory), after running iperf3 in
different netns, creating 32Mi sockets without data transfer in the root
netns causes regression for the iperf3's connection.

  uhash_entries  sockets  length  Gbps
            64K        1       1  5.69
                     1Mi      16  5.27
                     2Mi      32  4.90
                     4Mi      64  4.09
                     8Mi     128  2.96
                    16Mi     256  2.06
                    32Mi     512  1.12

The per-netns hash table breaks the lengthy lists into shorter ones. It is
useful on a multi-tenant system with thousands of netns. With smaller hash
tables, we can look up sockets faster, isolate noisy neighbours, and reduce
lock contention.

The max size of the per-netns table is 64K as well. This is because the
possible hash range by udp_hashfn() always fits in 64K within the same
netns and we cannot make full use of the whole buckets larger than 64K.

  /* 0 < num < 64K  ->  X < hash < X + 64K */
  (num + net_hash_mix(net)) & mask;

Also, the min size is 128. We use a bitmap to search for an available port
in udp_lib_get_port(). To keep the bitmap on the stack and not fire the
CONFIG_FRAME_WARN error at build time, we round up the table size to 128.
The sysctl usage is the same with TCP:

  $ dmesg | cut -d ' ' -f 6- | grep "UDP hash"
  UDP hash table entries: 65536 (order: 9, 2097152 bytes, vmalloc)

  # sysctl net.ipv4.udp_hash_entries
  net.ipv4.udp_hash_entries = 65536  # can be changed by uhash_entries

  # sysctl net.ipv4.udp_child_hash_entries
  net.ipv4.udp_child_hash_entries = 0  # disabled by default

  # ip netns add test1
  # ip netns exec test1 sysctl net.ipv4.udp_hash_entries
  net.ipv4.udp_hash_entries = -65536  # share the global table

  # sysctl -w net.ipv4.udp_child_hash_entries=100
  net.ipv4.udp_child_hash_entries = 100

  # ip netns add test2
  # ip netns exec test2 sysctl net.ipv4.udp_hash_entries
  net.ipv4.udp_hash_entries = 128  # own a per-netns table with 2^n buckets

We could optimise the hash table lookup/iteration further by removing
the netns comparison for the per-netns one in the future. Also, we
could optimise the sparse udp_hslot layout by putting it in udp_table.

[0]: https://lore.kernel.org/netdev/[email protected]/

Signed-off-by: Kuniyuki Iwashima <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent ba6aac1 commit 9804985

File tree

5 files changed

+166
-6
lines changed

5 files changed

+166
-6
lines changed

Documentation/networking/ip-sysctl.rst

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1177,6 +1177,33 @@ udp_rmem_min - INTEGER
11771177
udp_wmem_min - INTEGER
11781178
UDP does not have tx memory accounting and this tunable has no effect.
11791179

1180+
udp_hash_entries - INTEGER
1181+
Show the number of hash buckets for UDP sockets in the current
1182+
networking namespace.
1183+
1184+
A negative value means the networking namespace does not own its
1185+
hash buckets and shares the initial networking namespace's one.
1186+
1187+
udp_child_hash_entries - INTEGER
1188+
Control the number of hash buckets for UDP sockets in the child
1189+
networking namespace, which must be set before clone() or unshare().
1190+
1191+
If the value is not 0, the kernel uses a value rounded up to 2^n
1192+
as the actual hash bucket size. 0 is a special value, meaning
1193+
the child networking namespace will share the initial networking
1194+
namespace's hash buckets.
1195+
1196+
Note that the child will use the global one in case the kernel
1197+
fails to allocate enough memory. In addition, the global hash
1198+
buckets are spread over available NUMA nodes, but the allocation
1199+
of the child hash table depends on the current process's NUMA
1200+
policy, which could result in performance differences.
1201+
1202+
Possible values: 0, 2^n (n: 7 (128) - 16 (64K))
1203+
1204+
Default: 0
1205+
1206+
11801207
RAW variables
11811208
=============
11821209

include/linux/udp.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
2323
return (struct udphdr *)skb_transport_header(skb);
2424
}
2525

26+
#define UDP_HTABLE_SIZE_MIN_PERNET 128
2627
#define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256)
28+
#define UDP_HTABLE_SIZE_MAX 65536
2729

2830
static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
2931
{

include/net/netns/ipv4.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,8 @@ struct netns_ipv4 {
208208

209209
atomic_t dev_addr_genid;
210210

211+
unsigned int sysctl_udp_child_hash_entries;
212+
211213
#ifdef CONFIG_SYSCTL
212214
unsigned long *sysctl_local_reserved_ports;
213215
int sysctl_ip_prot_sock;

net/ipv4/sysctl_net_ipv4.c

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ static int one_day_secs = 24 * 3600;
4040
static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
4141
FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
4242
static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
43+
static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
4344
static int tcp_plb_max_rounds = 31;
4445
static int tcp_plb_max_cong_thresh = 256;
4546

@@ -402,12 +403,36 @@ static int proc_tcp_ehash_entries(struct ctl_table *table, int write,
402403
if (!net_eq(net, &init_net) && !hinfo->pernet)
403404
tcp_ehash_entries *= -1;
404405

406+
memset(&tbl, 0, sizeof(tbl));
405407
tbl.data = &tcp_ehash_entries;
406408
tbl.maxlen = sizeof(int);
407409

408410
return proc_dointvec(&tbl, write, buffer, lenp, ppos);
409411
}
410412

413+
/* Sysctl handler that reports how many UDP hash buckets the netns owning
 * @table uses.
 *
 * The value is computed on every call and handed to proc_dointvec()
 * through a temporary ctl_table, so nothing is stored in the netns itself.
 * The reported number is negated when a non-initial netns uses the global
 * udp_table rather than a table of its own.
 *
 * Returns whatever proc_dointvec() returns.
 */
static int proc_udp_hash_entries(struct ctl_table *table, int write,
414+
void *buffer, size_t *lenp, loff_t *ppos)
415+
{
416+
/* table->data is expected to point at the netns's
 * ipv4.sysctl_udp_child_hash_entries member; it is used here only to
 * get back to the enclosing struct net.
 */
struct net *net = container_of(table->data, struct net,
417+
ipv4.sysctl_udp_child_hash_entries);
418+
int udp_hash_entries;
419+
struct ctl_table tbl;
420+
421+
/* mask is (number of buckets - 1), hence the + 1. */
udp_hash_entries = net->ipv4.udp_table->mask + 1;
422+
423+
/* A negative number indicates that the child netns
424+
* shares the global udp_table.
425+
*/
426+
if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table)
427+
udp_hash_entries *= -1;
428+
429+
/* Point a zeroed, stack-local ctl_table at the computed value so the
 * generic integer handler can format it.
 */
memset(&tbl, 0, sizeof(tbl));
430+
tbl.data = &udp_hash_entries;
431+
tbl.maxlen = sizeof(int);
432+
433+
return proc_dointvec(&tbl, write, buffer, lenp, ppos);
434+
}
435+
411436
#ifdef CONFIG_IP_ROUTE_MULTIPATH
412437
static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
413438
void *buffer, size_t *lenp,
@@ -1361,6 +1386,21 @@ static struct ctl_table ipv4_net_table[] = {
13611386
.extra1 = SYSCTL_ZERO,
13621387
.extra2 = &tcp_child_ehash_entries_max,
13631388
},
1389+
{
1390+
.procname = "udp_hash_entries",
1391+
.data = &init_net.ipv4.sysctl_udp_child_hash_entries,
1392+
.mode = 0444,
1393+
.proc_handler = proc_udp_hash_entries,
1394+
},
1395+
{
1396+
.procname = "udp_child_hash_entries",
1397+
.data = &init_net.ipv4.sysctl_udp_child_hash_entries,
1398+
.maxlen = sizeof(unsigned int),
1399+
.mode = 0644,
1400+
.proc_handler = proc_douintvec_minmax,
1401+
.extra1 = SYSCTL_ZERO,
1402+
.extra2 = &udp_child_hash_entries_max,
1403+
},
13641404
{
13651405
.procname = "udp_rmem_min",
13661406
.data = &init_net.ipv4.sysctl_udp_rmem_min,

net/ipv4/udp.c

Lines changed: 95 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
129129
EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
130130

131131
#define MAX_UDP_PORTS 65536
132-
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
132+
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET)
133133

134134
static struct udp_table *udp_get_table_prot(struct sock *sk)
135135
{
@@ -3277,7 +3277,7 @@ void __init udp_table_init(struct udp_table *table, const char *name)
32773277
&table->log,
32783278
&table->mask,
32793279
UDP_HTABLE_SIZE_MIN,
3280-
64 * 1024);
3280+
UDP_HTABLE_SIZE_MAX);
32813281

32823282
table->hash2 = table->hash + (table->mask + 1);
32833283
for (i = 0; i <= table->mask; i++) {
@@ -3302,22 +3302,111 @@ u32 udp_flow_hashrnd(void)
33023302
}
33033303
EXPORT_SYMBOL(udp_flow_hashrnd);
33043304

3305-
static int __net_init udp_sysctl_init(struct net *net)
3305+
static void __net_init udp_sysctl_init(struct net *net)
33063306
{
3307-
net->ipv4.udp_table = &udp_table;
3308-
33093307
net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
33103308
net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;
33113309

33123310
#ifdef CONFIG_NET_L3_MASTER_DEV
33133311
net->ipv4.sysctl_udp_l3mdev_accept = 0;
33143312
#endif
3313+
}
3314+
3315+
/* Allocate and initialise a udp_table with @hash_entries buckets in each
 * of its two hash arrays (hash and hash2).
 *
 * Returns the new table, or NULL if either allocation fails. On failure
 * nothing is left allocated.
 *
 * NOTE(review): mask is set to hash_entries - 1, which presumes
 * @hash_entries is a power of two; confirm every caller guarantees this.
 */
static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
3316+
{
3317+
struct udp_table *udptable;
3318+
int i;
3319+
3320+
udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
3321+
if (!udptable)
3322+
goto out;
3323+
3324+
/* A single allocation backs both arrays: the first hash_entries slots
 * are used as hash, the following hash_entries slots as hash2.
 */
udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot),
3325+
GFP_KERNEL_ACCOUNT);
3326+
if (!udptable->hash)
3327+
goto free_table;
3328+
3329+
udptable->hash2 = udptable->hash + hash_entries;
3330+
udptable->mask = hash_entries - 1;
3331+
udptable->log = ilog2(hash_entries);
3332+
3333+
/* Give every slot in both arrays an empty list, a zero count and an
 * initialised lock.
 */
for (i = 0; i < hash_entries; i++) {
3334+
INIT_HLIST_HEAD(&udptable->hash[i].head);
3335+
udptable->hash[i].count = 0;
3336+
spin_lock_init(&udptable->hash[i].lock);
3337+
3338+
INIT_HLIST_HEAD(&udptable->hash2[i].head);
3339+
udptable->hash2[i].count = 0;
3340+
spin_lock_init(&udptable->hash2[i].lock);
3341+
}
3342+
3343+
return udptable;
3344+
3345+
/* Error unwinding: release the table struct if the bucket allocation
 * failed, then report failure.
 */
free_table:
3346+
kfree(udptable);
3347+
out:
3348+
return NULL;
3349+
}
3350+
3351+
/* Release the UDP hash table referenced by @net, unless @net is using the
 * global udp_table, in which case nothing is freed.
 */
static void __net_exit udp_pernet_table_free(struct net *net)
3352+
{
3353+
struct udp_table *udptable = net->ipv4.udp_table;
3354+
3355+
/* The global table is not owned by this netns; leave it alone. */
if (udptable == &udp_table)
3356+
return;
3357+
3358+
/* Free the bucket array first, then the table struct that points to it. */
kvfree(udptable->hash);
3359+
kfree(udptable);
3360+
}
3361+
3362+
/* Choose the UDP hash table for @net and store it in net->ipv4.udp_table.
 *
 * The global udp_table is used when:
 *   - @net is init_net,
 *   - the sysctl_udp_child_hash_entries value read below is 0, or
 *   - allocating a dedicated table fails (a warning is logged).
 * Otherwise a dedicated table is allocated, sized to the requested number
 * of entries raised to at least UDP_HTABLE_SIZE_MIN_PERNET and rounded up
 * to a power of two.
 */
static void __net_init udp_set_table(struct net *net)
3363+
{
3364+
struct udp_table *udptable;
3365+
unsigned int hash_entries;
3366+
struct net *old_net;
3367+
3368+
if (net_eq(net, &init_net))
3369+
goto fallback;
3370+
3371+
/* The requested size is read from the netns of the task running this
 * code (current), not from @net itself.
 */
old_net = current->nsproxy->net_ns;
3372+
hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
3373+
if (!hash_entries)
3374+
goto fallback;
3375+
3376+
/* Set min to keep the bitmap on stack in udp_lib_get_port() */
3377+
if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET)
3378+
hash_entries = UDP_HTABLE_SIZE_MIN_PERNET;
3379+
else
3380+
hash_entries = roundup_pow_of_two(hash_entries);
3381+
3382+
udptable = udp_pernet_table_alloc(hash_entries);
3383+
if (udptable) {
3384+
net->ipv4.udp_table = udptable;
3385+
} else {
3386+
pr_warn("Failed to allocate UDP hash table (entries: %u) "
3387+
"for a netns, fallback to the global one\n",
3388+
hash_entries);
3389+
/* The label sits inside the else branch on purpose: the two gotos
 * above and the allocation-failure path all end in the same
 * assignment, while the success path skips it.
 */
fallback:
3390+
net->ipv4.udp_table = &udp_table;
3391+
}
3392+
}
3393+
3394+
/* Per-netns init hook: runs udp_sysctl_init() and then udp_set_table()
 * for @net. Both callees return void, so this always returns 0.
 */
static int __net_init udp_pernet_init(struct net *net)
3395+
{
3396+
udp_sysctl_init(net);
3397+
udp_set_table(net);
33153398

33163399
return 0;
33173400
}
33183401

3402+
/* Per-netns exit hook: hands @net to udp_pernet_table_free() so that any
 * hash table allocated for it is released.
 */
static void __net_exit udp_pernet_exit(struct net *net)
3403+
{
3404+
udp_pernet_table_free(net);
3405+
}
3406+
33193407
static struct pernet_operations __net_initdata udp_sysctl_ops = {
3320-
.init = udp_sysctl_init,
3408+
.init = udp_pernet_init,
3409+
.exit = udp_pernet_exit,
33213410
};
33223411

33233412
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)

0 commit comments

Comments
 (0)