Skip to content

Commit 2bf3660

Browse files
kdrag0n authored and NipaLocal committed
net: fully namespace net.core.{r,w}mem_{default,max} sysctls
This builds on commit 19249c0 ("net: make net.core.{r,w}mem_{default,max} namespaced") by adding support for writing the sysctls from within net namespaces, rather than only reading the values that were set in init_net. These are relatively commonly used sysctls, so programs may try to set them without knowing that they are running in a container, and it can be surprising for such attempts to fail with EACCES.

Unlike with other net sysctls that were converted to namespaced ones, many systems have a sysctl.conf (or other configs) that globally write to net.core.rmem_default on boot and expect the value to propagate to containers, and programs running in containers may depend on the increased buffer sizes in order to work properly. This means that namespacing the sysctls and using the kernel default values in each new netns would break existing workloads.

As a compromise, inherit the initial net.core.*mem_* values from the current process's netns when creating a new netns. This is not standard behavior for most netns sysctls, but it avoids breaking existing workloads.

Signed-off-by: Danny Lin <[email protected]>
Signed-off-by: NipaLocal <nipa@local>
1 parent e6dd73b commit 2bf3660

File tree

9 files changed

+55
-50
lines changed

9 files changed

+55
-50
lines changed

include/net/netns/core.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ struct netns_core {
1717
u8 sysctl_txrehash;
1818
u8 sysctl_tstamp_allow_data;
1919

20+
u32 sysctl_wmem_max;
21+
u32 sysctl_rmem_max;
22+
u32 sysctl_wmem_default;
23+
u32 sysctl_rmem_default;
24+
2025
#ifdef CONFIG_PROC_FS
2126
struct prot_inuse __percpu *prot_inuse;
2227
#endif

include/net/sock.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2849,12 +2849,6 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo);
28492849
#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
28502850
#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
28512851

2852-
extern __u32 sysctl_wmem_max;
2853-
extern __u32 sysctl_rmem_max;
2854-
2855-
extern __u32 sysctl_wmem_default;
2856-
extern __u32 sysctl_rmem_default;
2857-
28582852
#define SKB_FRAG_PAGE_ORDER get_order(32768)
28592853
DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
28602854

net/core/net_namespace.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,27 @@ static __net_init void preinit_net_sysctl(struct net *net)
317317
net->core.sysctl_optmem_max = 128 * 1024;
318318
net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
319319
net->core.sysctl_tstamp_allow_data = 1;
320+
321+
/*
322+
* net.core.{r,w}mem_{default,max} used to be non-namespaced.
323+
* For backward compatibility, inherit values from the current netns
324+
* when creating a new one, so that setting them in init_net
325+
* affects new namespaces like it used to. This avoids causing
326+
* surprising performance regressions for namespaced applications
327+
* relying on tuned rmem/wmem.
328+
*/
329+
if (net == &init_net) {
330+
net->core.sysctl_wmem_max = SK_WMEM_MAX;
331+
net->core.sysctl_rmem_max = SK_RMEM_MAX;
332+
net->core.sysctl_wmem_default = SK_WMEM_MAX;
333+
net->core.sysctl_rmem_default = SK_RMEM_MAX;
334+
} else {
335+
struct net *current_net = current->nsproxy->net_ns;
336+
net->core.sysctl_wmem_max = current_net->core.sysctl_wmem_max;
337+
net->core.sysctl_rmem_max = current_net->core.sysctl_rmem_max;
338+
net->core.sysctl_wmem_default = current_net->core.sysctl_wmem_default;
339+
net->core.sysctl_rmem_default = current_net->core.sysctl_rmem_default;
340+
}
320341
}
321342

322343
/* init code that must occur even if setup_net() is not called. */

net/core/sock.c

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -278,14 +278,6 @@ static struct lock_class_key af_wlock_keys[AF_MAX];
278278
static struct lock_class_key af_elock_keys[AF_MAX];
279279
static struct lock_class_key af_kern_callback_keys[AF_MAX];
280280

281-
/* Run time adjustable parameters. */
282-
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283-
EXPORT_SYMBOL(sysctl_wmem_max);
284-
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285-
EXPORT_SYMBOL(sysctl_rmem_max);
286-
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287-
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
288-
289281
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
290282
EXPORT_SYMBOL_GPL(memalloc_socks_key);
291283

@@ -1333,7 +1325,7 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
13331325
* play 'guess the biggest size' games. RCVBUF/SNDBUF
13341326
* are treated in BSD as hints
13351327
*/
1336-
val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1328+
val = min_t(u32, val, READ_ONCE(sock_net(sk)->core.sysctl_wmem_max));
13371329
set_sndbuf:
13381330
/* Ensure val * 2 fits into an int, to prevent max_t()
13391331
* from treating it as a negative value.
@@ -1365,7 +1357,7 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
13651357
* play 'guess the biggest size' games. RCVBUF/SNDBUF
13661358
* are treated in BSD as hints
13671359
*/
1368-
__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1360+
__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sock_net(sk)->core.sysctl_rmem_max)));
13691361
break;
13701362

13711363
case SO_RCVBUFFORCE:
@@ -3618,8 +3610,8 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
36183610
timer_setup(&sk->sk_timer, NULL, 0);
36193611

36203612
sk->sk_allocation = GFP_KERNEL;
3621-
sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3622-
sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3613+
sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->core.sysctl_rmem_default);
3614+
sk->sk_sndbuf = READ_ONCE(sock_net(sk)->core.sysctl_wmem_default);
36233615
sk->sk_state = TCP_CLOSE;
36243616
sk->sk_use_task_frag = true;
36253617
sk_set_socket(sk, sock);

net/core/sysctl_net_core.c

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -676,45 +676,33 @@ static struct ctl_table netns_core_table[] = {
676676
.extra2 = SYSCTL_ONE,
677677
.proc_handler = proc_dou8vec_minmax,
678678
},
679-
{
680-
.procname = "tstamp_allow_data",
681-
.data = &init_net.core.sysctl_tstamp_allow_data,
682-
.maxlen = sizeof(u8),
683-
.mode = 0644,
684-
.proc_handler = proc_dou8vec_minmax,
685-
.extra1 = SYSCTL_ZERO,
686-
.extra2 = SYSCTL_ONE
687-
},
688-
/* sysctl_core_net_init() will set the values after this
689-
* to readonly in network namespaces
690-
*/
691679
{
692680
.procname = "wmem_max",
693-
.data = &sysctl_wmem_max,
681+
.data = &init_net.core.sysctl_wmem_max,
694682
.maxlen = sizeof(int),
695683
.mode = 0644,
696684
.proc_handler = proc_dointvec_minmax,
697685
.extra1 = &min_sndbuf,
698686
},
699687
{
700688
.procname = "rmem_max",
701-
.data = &sysctl_rmem_max,
689+
.data = &init_net.core.sysctl_rmem_max,
702690
.maxlen = sizeof(int),
703691
.mode = 0644,
704692
.proc_handler = proc_dointvec_minmax,
705693
.extra1 = &min_rcvbuf,
706694
},
707695
{
708696
.procname = "wmem_default",
709-
.data = &sysctl_wmem_default,
697+
.data = &init_net.core.sysctl_wmem_default,
710698
.maxlen = sizeof(int),
711699
.mode = 0644,
712700
.proc_handler = proc_dointvec_minmax,
713701
.extra1 = &min_sndbuf,
714702
},
715703
{
716704
.procname = "rmem_default",
717-
.data = &sysctl_rmem_default,
705+
.data = &init_net.core.sysctl_rmem_default,
718706
.maxlen = sizeof(int),
719707
.mode = 0644,
720708
.proc_handler = proc_dointvec_minmax,
@@ -748,13 +736,8 @@ static __net_init int sysctl_core_net_init(struct net *net)
748736
goto err_dup;
749737

750738
for (i = 0; i < table_size; ++i) {
751-
if (tbl[i].data == &sysctl_wmem_max)
752-
break;
753-
754739
tbl[i].data += (char *)net - (char *)&init_net;
755740
}
756-
for (; i < table_size; ++i)
757-
tbl[i].mode &= ~0222;
758741
}
759742

760743
net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size);

net/ipv4/ip_output.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1643,7 +1643,7 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
16431643

16441644
sk->sk_protocol = ip_hdr(skb)->protocol;
16451645
sk->sk_bound_dev_if = arg->bound_dev_if;
1646-
sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
1646+
sk->sk_sndbuf = READ_ONCE(net->core.sysctl_wmem_default);
16471647
ipc.sockc.mark = fl4.flowi4_mark;
16481648
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
16491649
len, 0, &ipc, &rt, MSG_DONTWAIT);

net/ipv4/tcp_output.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
241241
if (wscale_ok) {
242242
/* Set window scaling on max possible window */
243243
space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
244-
space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
244+
space = max_t(u32, space, READ_ONCE(sock_net(sk)->core.sysctl_rmem_max));
245245
space = min_t(u32, space, window_clamp);
246246
*rcv_wscale = clamp_t(int, ilog2(space) - 15,
247247
0, TCP_MAX_WSCALE);

net/netfilter/ipvs/ip_vs_sync.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,12 +1280,12 @@ static void set_sock_size(struct sock *sk, int mode, int val)
12801280
lock_sock(sk);
12811281
if (mode) {
12821282
val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
1283-
READ_ONCE(sysctl_wmem_max));
1283+
READ_ONCE(sock_net(sk)->core.sysctl_wmem_max));
12841284
sk->sk_sndbuf = val * 2;
12851285
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
12861286
} else {
12871287
val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
1288-
READ_ONCE(sysctl_rmem_max));
1288+
READ_ONCE(sock_net(sk)->core.sysctl_rmem_max));
12891289
sk->sk_rcvbuf = val * 2;
12901290
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
12911291
}

tools/testing/selftests/net/netns-sysctl.sh

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,21 +20,31 @@ fail() {
2020
setup_ns test_ns
2121

2222
for sc in {r,w}mem_{default,max}; do
23-
# check that this is writable in a netns
23+
initial_value="$(sysctl -n "net.core.$sc")"
24+
25+
# check that this is writable in the init netns
2426
[ -w "/proc/sys/net/core/$sc" ] ||
2527
fail "$sc isn't writable in the init netns!"
2628

27-
# change the value in the host netns
29+
# change the value in the init netns
2830
sysctl -qw "net.core.$sc=300000" ||
2931
fail "Can't write $sc in init netns!"
3032

31-
# check that the value is read from the init netns
32-
[ "$(ip netns exec $test_ns sysctl -n "net.core.$sc")" -eq 300000 ] ||
33+
# check that the value did not change in the test netns
34+
[ "$(ip netns exec $test_ns sysctl -n "net.core.$sc")" -eq "$initial_value" ] ||
3335
fail "Value for $sc mismatch!"
3436

35-
# check that this isn't writable in a netns
36-
ip netns exec $test_ns [ -w "/proc/sys/net/core/$sc" ] &&
37-
fail "$sc is writable in a netns!"
37+
# check that this is also writable in the test netns
38+
ip netns exec $test_ns [ -w "/proc/sys/net/core/$sc" ] ||
39+
fail "$sc isn't writable in the test netns!"
40+
41+
# change the value in the test netns
42+
ip netns exec $test_ns sysctl -qw "net.core.$sc=200000" ||
43+
fail "Can't write $sc in test netns!"
44+
45+
# check that the value is read from the test netns
46+
[ "$(ip netns exec $test_ns sysctl -n "net.core.$sc")" -eq 200000 ] ||
47+
fail "Value for $sc mismatch!"
3848
done
3949

4050
echo 'Test passed OK'

0 commit comments

Comments
 (0)