
Commit cb261b5

netoptimizer authored and borkmann committed
bpf: Run devmap xdp_prog on flush instead of bulk enqueue
This changes the devmap XDP program support to run the program when the bulk queue is flushed instead of before the frame is enqueued. This has a couple of benefits:

- It "sorts" the packets by destination devmap entry and then runs the same BPF program on all the packets in sequence. This ensures that we keep the XDP program and destination device properties hot in I-cache.

- It makes the multicast implementation simpler because it can just enqueue packets using bq_enqueue() without having to deal with the devmap program at all.

The drawback is that if the devmap program drops the packet, the enqueue step is redundant. However, arguably this is mostly visible in a micro-benchmark, and with more mixed traffic the I-cache benefit should win out.

The performance impact of just this patch is as follows. Using two 10Gb i40e NICs, redirecting one to the other or into a veth interface which does XDP_DROP on the veth peer, with xdp_redirect_map from samples/bpf, packets were sent via pktgen:

./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64

There is about +/- 0.1M deviation in the native tests; performance improves for the base case, but drops back somewhat with an XDP devmap program attached.

Version          | Test                         | Generic | Native | Native + 2nd xdp_prog
5.12 rc4         | xdp_redirect_map i40e->i40e  |    1.9M |   9.6M |  8.4M
5.12 rc4         | xdp_redirect_map i40e->veth  |    1.7M |  11.7M |  9.8M
5.12 rc4 + patch | xdp_redirect_map i40e->i40e  |    1.9M |   9.8M |  8.0M
5.12 rc4 + patch | xdp_redirect_map i40e->veth  |    1.7M |  12.0M |  9.4M

When bq_xmit_all() is called from bq_enqueue(), another packet will always be enqueued immediately after, so clearing dev_rx, xdp_prog and flush_node in bq_xmit_all() is redundant. Move the clearing to __dev_flush(), and only check these fields once in bq_enqueue() since they are all modified together.

This change also has the side effect of extending the lifetime of the RCU-protected xdp_prog that lives inside the devmap entries: instead of just living for the duration of the XDP program invocation, the reference now lives all the way until the bq is flushed. This is safe because the bq flush happens at the end of the NAPI poll loop, so everything happens between a local_bh_disable()/local_bh_enable() pair. However, this is by no means obvious from looking at the call sites; in particular, some drivers have an additional rcu_read_lock() around only the XDP program invocation, which only confuses matters further. Cleaning this up will be done in a separate patch series.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210519090747.1655268-2-liuhangbin@gmail.com
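To make the flush-time batching described above concrete, here is a minimal standalone C model (an editorial sketch, not kernel code and not part of this patch): frames are bulk-queued per destination, the destination's program is recorded once when a new batch starts, and the program runs over the whole batch at flush time. All names (model_bq_enqueue, drop_odd, and so on) are hypothetical, and the model collapses bq_xmit_all() and __dev_flush() into a single flush step.

#include <stdio.h>
#include <stddef.h>

#define MODEL_BULK_SIZE 16

enum model_verdict { MODEL_DROP, MODEL_PASS };

struct model_frame { int id; };

/* Stand-in for the XDP program attached to a devmap entry. */
typedef enum model_verdict (*model_prog_t)(struct model_frame *frame);

struct model_bulk_queue {
	struct model_frame *q[MODEL_BULK_SIZE];
	model_prog_t xdp_prog;	/* recorded once per batch, like bq->xdp_prog */
	unsigned int count;
};

/* Flush: run the recorded program over the whole batch in one go
 * (keeping it hot in I-cache), then "transmit" the surviving frames.
 */
static void model_bq_flush(struct model_bulk_queue *bq)
{
	unsigned int i, to_send = 0;

	if (bq->xdp_prog) {
		for (i = 0; i < bq->count; i++)
			if (bq->xdp_prog(bq->q[i]) == MODEL_PASS)
				bq->q[to_send++] = bq->q[i];	/* compact survivors */
	} else {
		to_send = bq->count;
	}

	for (i = 0; i < to_send; i++)
		printf("xmit frame %d\n", bq->q[i]->id);

	bq->count = 0;
	bq->xdp_prog = NULL;	/* per-batch state cleared at flush time, not per enqueue */
}

/* Enqueue: no program invocation here; just record the program when a
 * new batch starts and append the frame.
 */
static void model_bq_enqueue(struct model_bulk_queue *bq,
			     struct model_frame *frame, model_prog_t prog)
{
	if (bq->count == MODEL_BULK_SIZE)
		model_bq_flush(bq);

	if (!bq->count)
		bq->xdp_prog = prog;

	bq->q[bq->count++] = frame;
}

/* Hypothetical devmap program: drop odd frame ids, pass even ones. */
static enum model_verdict drop_odd(struct model_frame *frame)
{
	return (frame->id & 1) ? MODEL_DROP : MODEL_PASS;
}

int main(void)
{
	struct model_bulk_queue bq = { .count = 0 };
	struct model_frame frames[4] = { {0}, {1}, {2}, {3} };
	int i;

	for (i = 0; i < 4; i++)
		model_bq_enqueue(&bq, &frames[i], drop_odd);

	model_bq_flush(&bq);	/* like xdp_do_flush() at the end of NAPI poll */
	return 0;
}

As in the patch, the per-batch state is written only when a batch starts and cleared only at flush time, so the per-packet enqueue path stays a plain append.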
1 parent 21703cf commit cb261b5

File tree

1 file changed (+76, -51 lines)

kernel/bpf/devmap.c (+76, -51)
@@ -57,6 +57,7 @@ struct xdp_dev_bulk_queue {
 	struct list_head flush_node;
 	struct net_device *dev;
 	struct net_device *dev_rx;
+	struct bpf_prog *xdp_prog;
 	unsigned int count;
 };
 
@@ -326,22 +327,71 @@ bool dev_map_can_have_prog(struct bpf_map *map)
 	return false;
 }
 
+static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
+				struct xdp_frame **frames, int n,
+				struct net_device *dev)
+{
+	struct xdp_txq_info txq = { .dev = dev };
+	struct xdp_buff xdp;
+	int i, nframes = 0;
+
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		u32 act;
+		int err;
+
+		xdp_convert_frame_to_buff(xdpf, &xdp);
+		xdp.txq = &txq;
+
+		act = bpf_prog_run_xdp(xdp_prog, &xdp);
+		switch (act) {
+		case XDP_PASS:
+			err = xdp_update_frame_from_buff(&xdp, xdpf);
+			if (unlikely(err < 0))
+				xdp_return_frame_rx_napi(xdpf);
+			else
+				frames[nframes++] = xdpf;
+			break;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+			fallthrough;
+		case XDP_ABORTED:
+			trace_xdp_exception(dev, xdp_prog, act);
+			fallthrough;
+		case XDP_DROP:
+			xdp_return_frame_rx_napi(xdpf);
+			break;
+		}
+	}
+	return nframes; /* sent frames count */
+}
+
 static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 {
 	struct net_device *dev = bq->dev;
-	int sent = 0, err = 0;
+	int sent = 0, drops = 0, err = 0;
+	unsigned int cnt = bq->count;
+	int to_send = cnt;
 	int i;
 
-	if (unlikely(!bq->count))
+	if (unlikely(!cnt))
 		return;
 
-	for (i = 0; i < bq->count; i++) {
+	for (i = 0; i < cnt; i++) {
 		struct xdp_frame *xdpf = bq->q[i];
 
 		prefetch(xdpf);
 	}
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+	if (bq->xdp_prog) {
+		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+		if (!to_send)
+			goto out;
+
+		drops = cnt - to_send;
+	}
+
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
 	if (sent < 0) {
 		/* If ndo_xdp_xmit fails with an errno, no frames have
 		 * been xmit'ed.
@@ -353,13 +403,13 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 	/* If not all frames have been transmitted, it is our
 	 * responsibility to free them
 	 */
-	for (i = sent; unlikely(i < bq->count); i++)
+	for (i = sent; unlikely(i < to_send); i++)
 		xdp_return_frame_rx_napi(bq->q[i]);
 
-	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, bq->count - sent, err);
-	bq->dev_rx = NULL;
+out:
+	drops = cnt - sent;
 	bq->count = 0;
-	__list_del_clearprev(&bq->flush_node);
+	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
 }
 
 /* __dev_flush is called from xdp_do_flush() which _must_ be signaled
@@ -377,8 +427,12 @@ void __dev_flush(void)
 	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
 	struct xdp_dev_bulk_queue *bq, *tmp;
 
-	list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
 		bq_xmit_all(bq, XDP_XMIT_FLUSH);
+		bq->dev_rx = NULL;
+		bq->xdp_prog = NULL;
+		__list_del_clearprev(&bq->flush_node);
+	}
 }
 
 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -401,7 +455,7 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
  * Thus, safe percpu variable access.
  */
 static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
-		       struct net_device *dev_rx)
+		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
 {
 	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
 	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
@@ -412,18 +466,22 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 	/* Ingress dev_rx will be the same for all xdp_frame's in
 	 * bulk_queue, because bq stored per-CPU and must be flushed
 	 * from net_device drivers NAPI func end.
+	 *
+	 * Do the same with xdp_prog and flush_list since these fields
+	 * are only ever modified together.
 	 */
-	if (!bq->dev_rx)
+	if (!bq->dev_rx) {
 		bq->dev_rx = dev_rx;
+		bq->xdp_prog = xdp_prog;
+		list_add(&bq->flush_node, flush_list);
+	}
 
 	bq->q[bq->count++] = xdpf;
-
-	if (!bq->flush_node.prev)
-		list_add(&bq->flush_node, flush_list);
 }
 
 static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
-				struct net_device *dev_rx)
+				struct net_device *dev_rx,
+				struct bpf_prog *xdp_prog)
 {
 	struct xdp_frame *xdpf;
 	int err;
@@ -439,55 +497,22 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	bq_enqueue(dev, xdpf, dev_rx);
+	bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
 	return 0;
 }
 
-static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
-					 struct xdp_buff *xdp,
-					 struct bpf_prog *xdp_prog)
-{
-	struct xdp_txq_info txq = { .dev = dev };
-	u32 act;
-
-	xdp_set_data_meta_invalid(xdp);
-	xdp->txq = &txq;
-
-	act = bpf_prog_run_xdp(xdp_prog, xdp);
-	switch (act) {
-	case XDP_PASS:
-		return xdp;
-	case XDP_DROP:
-		break;
-	default:
-		bpf_warn_invalid_xdp_action(act);
-		fallthrough;
-	case XDP_ABORTED:
-		trace_xdp_exception(dev, xdp_prog, act);
-		break;
-	}
-
-	xdp_return_buff(xdp);
-	return NULL;
-}
-
 int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
-	return __xdp_enqueue(dev, xdp, dev_rx);
+	return __xdp_enqueue(dev, xdp, dev_rx, NULL);
 }
 
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
 	struct net_device *dev = dst->dev;
 
-	if (dst->xdp_prog) {
-		xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
-		if (!xdp)
-			return 0;
-	}
-	return __xdp_enqueue(dev, xdp, dev_rx);
+	return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
 }
 
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
