
Commit b013840

Daniel Borkmann authored and davem330 committed
packet: use percpu mmap tx frame pending refcount
In PF_PACKET's packet mmap(), we can avoid using one atomic_inc() and
one atomic_dec() call in skb destructor and use a percpu reference
count instead in order to determine if packets are still pending to
be sent out. Micro-benchmark with [1] that has been slightly modified
(that is, protocol = 0 in socket(2) and bind(2)), example on a rather
crappy testing machine; I expect it to scale and have even better
results on bigger machines:

./packet_mm_tx -s7000 -m7200 -z700000 em1, avg over 2500 runs:

With patch:    4,022,015 cyc
Without patch: 4,812,994 cyc

time ./packet_mm_tx -s64 -c10000000 em1 > /dev/null, stable:

With patch:
  real 1m32.241s
  user 0m0.287s
  sys  1m29.316s

Without patch:
  real 1m38.386s
  user 0m0.265s
  sys  1m35.572s

In function tpacket_snd(), it is okay to use packet_read_pending()
since in fast-path we short-circuit the condition already with
ph != NULL, since we have next frames to process. In case we have
MSG_DONTWAIT, we also do not execute this path as need_wait is false
here anyway, and in case of _no_ MSG_DONTWAIT flag, it is okay to call
a packet_read_pending(), because when we ever reach that path, we're
done processing outgoing frames anyway and only look if there are
skbs still outstanding to be orphaned. We can stay lockless in this
percpu counter since it's acceptable when we reach this path for the
sum to be imprecise first, but we'll level out at 0 after all pending
frames have reached the skb destructor eventually through tx reclaim.
When people pin a tx process to particular CPUs, we expect overflows
to happen in the reference counter as on one CPU we expect a heavy
increase, and a decrease distributed through ksoftirqd on all CPUs,
for example. As David Laight points out, since the C language doesn't
define the result of signed int overflow (i.e. rather than wrap, it
is allowed to saturate as a possible outcome), we have to use
unsigned int as reference count. The sum over all CPUs when tx is
complete will result in 0 again.

The BUG_ON() in tpacket_destruct_skb() we can remove as well. It can
_only_ be set from inside the tpacket_snd() path and we made sure to
increase tx_ring.pending in any case before we called po->xmit(skb).
So testing for tx_ring.pending == 0 is not too useful. Instead, it
would rather have been useful to test if lower layers didn't orphan
the skb so that we're missing ring slots being put back to
TP_STATUS_AVAILABLE. But such a bug will be caught in user space
already as we end up realizing that we do not have any
TP_STATUS_AVAILABLE slots left anymore. Therefore, we're all set.

Btw, in case of the RX_RING path, we do not make use of the pending
member, therefore we also don't need to use up any percpu memory
here. Also note that __alloc_percpu() already returns a zero-filled
percpu area, so initialization is done already.

  [1] http://wiki.ipxwarzone.com/index.php5?title=Linux_packet_mmap

Signed-off-by: Daniel Borkmann <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
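The unsigned-wraparound argument above can be sanity-checked in isolation. Below is a minimal userspace C sketch (illustrative only, not part of the patch): two array slots stand in for per-CPU counters, one taking all increments (a tx process pinned to one CPU), the other all decrements (tx reclaim running elsewhere). Each counter wraps individually, yet the unsigned sum comes out to exactly 0 once every frame has been reclaimed, which is the property packet_read_pending() relies on.

#include <stdio.h>

int main(void)
{
        unsigned int pending[2] = { 0, 0 };
        unsigned long long frames = 0x100000001ULL; /* more frames than UINT_MAX */

        /* "CPU 0": the pinned tx process increments once per frame. */
        for (unsigned long long i = 0; i < frames; i++)
                pending[0]++;   /* wraps past UINT_MAX; defined for unsigned */

        /* "CPU 1": tx reclaim (skb destructor) decrements once per frame. */
        for (unsigned long long i = 0; i < frames; i++)
                pending[1]--;   /* wraps below zero; defined for unsigned */

        /* packet_read_pending() analogue: sum the slots of all "CPUs". */
        printf("cpu0=%u cpu1=%u sum=%u\n",
               pending[0], pending[1], pending[0] + pending[1]);
        /* prints: cpu0=1 cpu1=4294967295 sum=0 */
        return 0;
}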
1 parent 87a2fd2 commit b013840

3 files changed: +62 −7 lines changed

net/packet/af_packet.c

Lines changed: 60 additions & 6 deletions
@@ -89,6 +89,7 @@
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
 #include <linux/reciprocal_div.h>
+#include <linux/percpu.h>
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
@@ -1168,6 +1169,47 @@ static void packet_increment_head(struct packet_ring_buffer *buff)
         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
 }
 
+static void packet_inc_pending(struct packet_ring_buffer *rb)
+{
+        this_cpu_inc(*rb->pending_refcnt);
+}
+
+static void packet_dec_pending(struct packet_ring_buffer *rb)
+{
+        this_cpu_dec(*rb->pending_refcnt);
+}
+
+static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
+{
+        unsigned int refcnt = 0;
+        int cpu;
+
+        /* We don't use pending refcount in rx_ring. */
+        if (rb->pending_refcnt == NULL)
+                return 0;
+
+        for_each_possible_cpu(cpu)
+                refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
+
+        return refcnt;
+}
+
+static int packet_alloc_pending(struct packet_sock *po)
+{
+        po->rx_ring.pending_refcnt = NULL;
+
+        po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
+        if (unlikely(po->tx_ring.pending_refcnt == NULL))
+                return -ENOBUFS;
+
+        return 0;
+}
+
+static void packet_free_pending(struct packet_sock *po)
+{
+        free_percpu(po->tx_ring.pending_refcnt);
+}
+
 static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 {
         struct sock *sk = &po->sk;
@@ -2014,8 +2056,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
                 __u32 ts;
 
                 ph = skb_shinfo(skb)->destructor_arg;
-                BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
-                atomic_dec(&po->tx_ring.pending);
+                packet_dec_pending(&po->tx_ring);
 
                 ts = __packet_set_timestamp(po, ph, skb);
                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
@@ -2236,7 +2277,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
                 skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
                 skb->destructor = tpacket_destruct_skb;
                 __packet_set_status(po, ph, TP_STATUS_SENDING);
-                atomic_inc(&po->tx_ring.pending);
+                packet_inc_pending(&po->tx_ring);
 
                 status = TP_STATUS_SEND_REQUEST;
                 err = po->xmit(skb);
@@ -2256,8 +2297,14 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
                 }
                 packet_increment_head(&po->tx_ring);
                 len_sum += tp_len;
-        } while (likely((ph != NULL) || (need_wait &&
-                                         atomic_read(&po->tx_ring.pending))));
+        } while (likely((ph != NULL) ||
+                /* Note: packet_read_pending() might be slow if we have
+                 * to call it as it's per_cpu variable, but in fast-path
+                 * we already short-circuit the loop with the first
+                 * condition, and luckily don't have to go that path
+                 * anyway.
+                 */
+                 (need_wait && packet_read_pending(&po->tx_ring))));
 
         err = len_sum;
         goto out_put;
@@ -2556,6 +2603,7 @@ static int packet_release(struct socket *sock)
         /* Purge queues */
 
         skb_queue_purge(&sk->sk_receive_queue);
+        packet_free_pending(po);
         sk_refcnt_debug_release(sk);
 
         sock_put(sk);
@@ -2717,6 +2765,10 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
         po->num = proto;
         po->xmit = dev_queue_xmit;
 
+        err = packet_alloc_pending(po);
+        if (err)
+                goto out2;
+
         packet_cached_dev_reset(po);
 
         sk->sk_destruct = packet_sock_destruct;
@@ -2749,6 +2801,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
         preempt_enable();
 
         return 0;
+out2:
+        sk_free(sk);
 out:
         return err;
 }
@@ -3676,7 +3730,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
         if (!closing) {
                 if (atomic_read(&po->mapped))
                         goto out;
-                if (atomic_read(&rb->pending))
+                if (packet_read_pending(rb))
                         goto out;
         }

net/packet/diag.c

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 #include <linux/net.h>
 #include <linux/netdevice.h>
 #include <linux/packet_diag.h>
+#include <linux/percpu.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>

net/packet/internal.h

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ struct packet_ring_buffer {
         unsigned int            pg_vec_pages;
         unsigned int            pg_vec_len;
 
-        atomic_t                pending;
+        unsigned int __percpu   *pending_refcnt;
 
         struct tpacket_kbdq_core prb_bdqc;
 };
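For readers unfamiliar with the annotation: unsigned int __percpu *pending_refcnt is not a pointer into ordinary memory but a handle into a dynamically allocated per-CPU area, so it must only be dereferenced through the percpu accessors. A condensed, illustrative kernel-context sketch of the allocate/update/sum/free pattern the new field follows (the demo_* names are made up for illustration, not code from this commit):

#include <linux/percpu.h>
#include <linux/errno.h>

static unsigned int __percpu *cnt;      /* same shape as pending_refcnt */

static int demo_alloc(void)
{
        /* alloc_percpu() hands back a zero-filled slot per possible CPU. */
        cnt = alloc_percpu(unsigned int);
        return cnt ? 0 : -ENOBUFS;
}

static void demo_update(void)
{
        this_cpu_inc(*cnt);     /* touches only the local CPU's slot, no lock */
        this_cpu_dec(*cnt);
}

static unsigned int demo_read(void)
{
        unsigned int sum = 0;
        int cpu;

        /* Racy but safe: momentarily imprecise while updates are in
         * flight, exact once all of them have settled. */
        for_each_possible_cpu(cpu)
                sum += *per_cpu_ptr(cnt, cpu);
        return sum;
}

static void demo_free(void)
{
        free_percpu(cnt);
}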
