mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Add support for the AF_XDP zero-copy transmit path.

A new TX buffer type IGB_TYPE_XSK is introduced to indicate that the Tx
frame was allocated from the xsk buff pool, so igb_clean_tx_ring() and
igb_clean_tx_irq() can clean the buffers correctly based on type.

igb_xmit_zc() performs the actual packet transmit when AF_XDP zero-copy
is enabled. We share the TX ring between slow path, XDP and AF_XDP
zero-copy, so we use the netdev queue lock to ensure mutual exclusion.

Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
[Kurt: Set olinfo_status in igb_xmit_zc() so that frames are transmitted,
       use READ_ONCE() for xsk_pool and check Tx disabled and carrier in
       igb_xmit_zc(), add FIXME for RS bit]
Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Tested-by: George Kuruvinakunnel <george.kuruvinakunnel@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://patch.msgid.link/20250106221929.956999-7-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
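Note that the queue lock mentioned above is not taken inside igb_xmit_zc() itself; the caller on the Tx clean path is expected to hold the netdev Tx queue lock. Below is a minimal, illustrative sketch of that locking pattern, using a hypothetical wrapper name; the real call site lives outside this file (in igb_main.c) and may be structured differently.

/* Illustrative only: serialize the AF_XDP zero-copy transmit against the
 * regular and XDP transmit paths by holding the netdev Tx queue lock.
 * The wrapper name is made up for this sketch.
 */
static void igb_xsk_tx_example(struct igb_ring *tx_ring,
                               struct xsk_buff_pool *xsk_pool)
{
        struct netdev_queue *nq = txring_txq(tx_ring);

        __netif_tx_lock(nq, smp_processor_id());

        /* Avoid transmit queue timeouts since the queue is shared with
         * the slow path.
         */
        txq_trans_cond_update(nq);

        igb_xmit_zc(tx_ring, xsk_pool);

        __netif_tx_unlock(nq);
}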
563 lines, 13 KiB, C
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2018 Intel Corporation. */

#include <linux/bpf_trace.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>

#include "e1000_hw.h"
#include "igb.h"

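/* Swap the ring between the regular rx_buffer_info array and the
 * zero-copy rx_buffer_info_zc array, allocating whichever variant the
 * new mode needs and freeing the other.
 */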
static int igb_realloc_rx_buffer_info(struct igb_ring *ring, bool pool_present)
{
        int size = pool_present ?
                   sizeof(*ring->rx_buffer_info_zc) * ring->count :
                   sizeof(*ring->rx_buffer_info) * ring->count;
        void *buff_info = vmalloc(size);

        if (!buff_info)
                return -ENOMEM;

        if (pool_present) {
                vfree(ring->rx_buffer_info);
                ring->rx_buffer_info = NULL;
                ring->rx_buffer_info_zc = buff_info;
        } else {
                vfree(ring->rx_buffer_info_zc);
                ring->rx_buffer_info_zc = NULL;
                ring->rx_buffer_info = buff_info;
        }

        return 0;
}

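/* Quiesce a Tx/Rx queue pair: mark Tx disabled, stop the hardware
 * queues, disable the shared NAPI context, then clean the rings and
 * reset their statistics.
 */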
static void igb_txrx_ring_disable(struct igb_adapter *adapter, u16 qid)
{
        struct igb_ring *tx_ring = adapter->tx_ring[qid];
        struct igb_ring *rx_ring = adapter->rx_ring[qid];
        struct e1000_hw *hw = &adapter->hw;

        set_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags);

        wr32(E1000_TXDCTL(tx_ring->reg_idx), 0);
        wr32(E1000_RXDCTL(rx_ring->reg_idx), 0);

        synchronize_net();

        /* Rx/Tx share the same napi context. */
        napi_disable(&rx_ring->q_vector->napi);

        igb_clean_tx_ring(tx_ring);
        igb_clean_rx_ring(rx_ring);

        memset(&rx_ring->rx_stats, 0, sizeof(rx_ring->rx_stats));
        memset(&tx_ring->tx_stats, 0, sizeof(tx_ring->tx_stats));
}

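/* Bring a Tx/Rx queue pair back up: reconfigure both rings, clear the
 * Tx-disabled flag, refill the Rx ring (from the XSK pool when one is
 * attached) and re-enable the shared NAPI context.
 */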
static void igb_txrx_ring_enable(struct igb_adapter *adapter, u16 qid)
{
        struct igb_ring *tx_ring = adapter->tx_ring[qid];
        struct igb_ring *rx_ring = adapter->rx_ring[qid];

        igb_configure_tx_ring(adapter, tx_ring);
        igb_configure_rx_ring(adapter, rx_ring);

        synchronize_net();

        clear_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags);

        /* call igb_desc_unused which always leaves
         * at least 1 descriptor unused to make sure
         * next_to_use != next_to_clean
         */
        if (rx_ring->xsk_pool)
                igb_alloc_rx_buffers_zc(rx_ring, rx_ring->xsk_pool,
                                        igb_desc_unused(rx_ring));
        else
                igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring));

        /* Rx/Tx share the same napi context. */
        napi_enable(&rx_ring->q_vector->napi);
}

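/* Return the XSK buffer pool bound to this ring's queue, or NULL if no
 * pool is attached or XDP is not enabled on the adapter.
 */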
struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter,
                                   struct igb_ring *ring)
{
        int qid = ring->queue_index;
        struct xsk_buff_pool *pool;

        pool = xsk_get_pool_from_qid(adapter->netdev, qid);

        if (!igb_xdp_is_enabled(adapter))
                return NULL;

        return (pool && pool->dev) ? pool : NULL;
}

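/* Attach an XSK buffer pool to queue @qid: validate the queue index,
 * DMA-map the pool and, if the interface is running with XDP, restart
 * the queue pair in zero-copy mode and kick NAPI so Rx starts.
 */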
static int igb_xsk_pool_enable(struct igb_adapter *adapter,
                               struct xsk_buff_pool *pool,
                               u16 qid)
{
        struct net_device *netdev = adapter->netdev;
        struct igb_ring *rx_ring;
        bool if_running;
        int err;

        if (qid >= adapter->num_rx_queues)
                return -EINVAL;

        if (qid >= netdev->real_num_rx_queues ||
            qid >= netdev->real_num_tx_queues)
                return -EINVAL;

        err = xsk_pool_dma_map(pool, &adapter->pdev->dev, IGB_RX_DMA_ATTR);
        if (err)
                return err;

        rx_ring = adapter->rx_ring[qid];
        if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter);
        if (if_running)
                igb_txrx_ring_disable(adapter, qid);

        if (if_running) {
                err = igb_realloc_rx_buffer_info(rx_ring, true);
                if (!err) {
                        igb_txrx_ring_enable(adapter, qid);
                        /* Kick start the NAPI context so that receiving will start */
                        err = igb_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX);
                }

                if (err) {
                        xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR);
                        return err;
                }
        }

        return 0;
}

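/* Detach the XSK buffer pool from queue @qid: DMA-unmap the pool and,
 * if the interface is running with XDP, switch the ring back to the
 * regular buffer bookkeeping and restart the queue pair.
 */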
static int igb_xsk_pool_disable(struct igb_adapter *adapter, u16 qid)
{
        struct xsk_buff_pool *pool;
        struct igb_ring *rx_ring;
        bool if_running;
        int err;

        pool = xsk_get_pool_from_qid(adapter->netdev, qid);
        if (!pool)
                return -EINVAL;

        rx_ring = adapter->rx_ring[qid];
        if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter);
        if (if_running)
                igb_txrx_ring_disable(adapter, qid);

        xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR);

        if (if_running) {
                err = igb_realloc_rx_buffer_info(rx_ring, false);
                if (err)
                        return err;

                igb_txrx_ring_enable(adapter, qid);
        }

        return 0;
}

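/* Install or remove an XSK pool on queue @qid: enable when a pool is
 * passed, disable otherwise.
 */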
int igb_xsk_pool_setup(struct igb_adapter *adapter,
                       struct xsk_buff_pool *pool,
                       u16 qid)
{
        return pool ? igb_xsk_pool_enable(adapter, pool, qid) :
                igb_xsk_pool_disable(adapter, qid);
}

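/* Fill up to @count Rx descriptors with buffers allocated from the XSK
 * pool. Returns the number of descriptors actually filled.
 */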
static u16 igb_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp,
                             union e1000_adv_rx_desc *rx_desc, u16 count)
{
        dma_addr_t dma;
        u16 buffs;
        int i;

        /* nothing to do */
        if (!count)
                return 0;

        buffs = xsk_buff_alloc_batch(pool, xdp, count);
        for (i = 0; i < buffs; i++) {
                dma = xsk_buff_xdp_get_dma(*xdp);
                rx_desc->read.pkt_addr = cpu_to_le64(dma);
                rx_desc->wb.upper.length = 0;

                rx_desc++;
                xdp++;
        }

        return buffs;
}

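/* Refill @count Rx descriptors from the XSK pool, handling the wrap at
 * the end of the ring, and bump the tail register when new descriptors
 * were placed. Returns true if all requested buffers were allocated.
 */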
bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring,
                             struct xsk_buff_pool *xsk_pool, u16 count)
{
        u32 nb_buffs_extra = 0, nb_buffs = 0;
        union e1000_adv_rx_desc *rx_desc;
        u16 ntu = rx_ring->next_to_use;
        u16 total_count = count;
        struct xdp_buff **xdp;

        rx_desc = IGB_RX_DESC(rx_ring, ntu);
        xdp = &rx_ring->rx_buffer_info_zc[ntu];

        if (ntu + count >= rx_ring->count) {
                nb_buffs_extra = igb_fill_rx_descs(xsk_pool, xdp, rx_desc,
                                                   rx_ring->count - ntu);
                if (nb_buffs_extra != rx_ring->count - ntu) {
                        ntu += nb_buffs_extra;
                        goto exit;
                }
                rx_desc = IGB_RX_DESC(rx_ring, 0);
                xdp = rx_ring->rx_buffer_info_zc;
                ntu = 0;
                count -= nb_buffs_extra;
        }

        nb_buffs = igb_fill_rx_descs(xsk_pool, xdp, rx_desc, count);
        ntu += nb_buffs;
        if (ntu == rx_ring->count)
                ntu = 0;

        /* clear the length for the next_to_use descriptor */
        rx_desc = IGB_RX_DESC(rx_ring, ntu);
        rx_desc->wb.upper.length = 0;

exit:
        if (rx_ring->next_to_use != ntu) {
                rx_ring->next_to_use = ntu;

                /* Force memory writes to complete before letting h/w
                 * know there are new descriptors to fetch. (Only
                 * applicable for weak-ordered memory model archs,
                 * such as IA-64).
                 */
                wmb();
                writel(ntu, rx_ring->tail);
        }

        return total_count == (nb_buffs + nb_buffs_extra);
}

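/* Return all not-yet-received zero-copy buffers between next_to_clean
 * and next_to_use to the XSK pool.
 */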
void igb_clean_rx_ring_zc(struct igb_ring *rx_ring)
{
        u16 ntc = rx_ring->next_to_clean;
        u16 ntu = rx_ring->next_to_use;

        while (ntc != ntu) {
                struct xdp_buff *xdp = rx_ring->rx_buffer_info_zc[ntc];

                xsk_buff_free(xdp);
                ntc++;
                if (ntc >= rx_ring->count)
                        ntc = 0;
        }
}

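/* Copy the received frame (including any XDP metadata) out of the XSK
 * buffer into a freshly allocated skb, propagating the hardware
 * timestamp when one was extracted.
 */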
static struct sk_buff *igb_construct_skb_zc(struct igb_ring *rx_ring,
                                            struct xdp_buff *xdp,
                                            ktime_t timestamp)
{
        unsigned int totalsize = xdp->data_end - xdp->data_meta;
        unsigned int metasize = xdp->data - xdp->data_meta;
        struct sk_buff *skb;

        net_prefetch(xdp->data_meta);

        /* allocate a skb to store the frags */
        skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize);
        if (unlikely(!skb))
                return NULL;

        if (timestamp)
                skb_hwtstamps(skb)->hwtstamp = timestamp;

        memcpy(__skb_put(skb, totalsize), xdp->data_meta,
               ALIGN(totalsize, sizeof(long)));

        if (metasize) {
                skb_metadata_set(skb, metasize);
                __skb_pull(skb, metasize);
        }

        return skb;
}

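/* Run the XDP program on a zero-copy buffer. XDP_REDIRECT is the
 * expected fast path; IGB_XDP_EXIT is returned when the redirect fails
 * because the AF_XDP Rx ring is full, so the caller can stop early.
 */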
static int igb_run_xdp_zc(struct igb_adapter *adapter, struct igb_ring *rx_ring,
                          struct xdp_buff *xdp, struct xsk_buff_pool *xsk_pool,
                          struct bpf_prog *xdp_prog)
{
        int err, result = IGB_XDP_PASS;
        u32 act;

        prefetchw(xdp->data_hard_start); /* xdp_frame write */

        act = bpf_prog_run_xdp(xdp_prog, xdp);

        if (likely(act == XDP_REDIRECT)) {
                err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
                if (!err)
                        return IGB_XDP_REDIR;

                if (xsk_uses_need_wakeup(xsk_pool) &&
                    err == -ENOBUFS)
                        result = IGB_XDP_EXIT;
                else
                        result = IGB_XDP_CONSUMED;
                goto out_failure;
        }

        switch (act) {
        case XDP_PASS:
                break;
        case XDP_TX:
                result = igb_xdp_xmit_back(adapter, xdp);
                if (result == IGB_XDP_CONSUMED)
                        goto out_failure;
                break;
        default:
                bpf_warn_invalid_xdp_action(adapter->netdev, xdp_prog, act);
                fallthrough;
        case XDP_ABORTED:
out_failure:
                trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
                fallthrough;
        case XDP_DROP:
                result = IGB_XDP_CONSUMED;
                break;
        }

        return result;
}

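/* Zero-copy Rx poll loop: process received descriptors up to @budget,
 * run XDP on each buffer, build skbs for XDP_PASS frames, refill the
 * ring from the XSK pool and manage the need_wakeup flag.
 */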
int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector,
                        struct xsk_buff_pool *xsk_pool, const int budget)
{
        struct igb_adapter *adapter = q_vector->adapter;
        unsigned int total_bytes = 0, total_packets = 0;
        struct igb_ring *rx_ring = q_vector->rx.ring;
        u32 ntc = rx_ring->next_to_clean;
        struct bpf_prog *xdp_prog;
        unsigned int xdp_xmit = 0;
        bool failure = false;
        u16 entries_to_alloc;
        struct sk_buff *skb;

        /* xdp_prog cannot be NULL in the ZC path */
        xdp_prog = READ_ONCE(rx_ring->xdp_prog);

        while (likely(total_packets < budget)) {
                union e1000_adv_rx_desc *rx_desc;
                ktime_t timestamp = 0;
                struct xdp_buff *xdp;
                unsigned int size;
                int xdp_res = 0;

                rx_desc = IGB_RX_DESC(rx_ring, ntc);
                size = le16_to_cpu(rx_desc->wb.upper.length);
                if (!size)
                        break;

                /* This memory barrier is needed to keep us from reading
                 * any other fields out of the rx_desc until we know the
                 * descriptor has been written back
                 */
                dma_rmb();

                xdp = rx_ring->rx_buffer_info_zc[ntc];
                xsk_buff_set_size(xdp, size);
                xsk_buff_dma_sync_for_cpu(xdp);

                /* pull rx packet timestamp if available and valid */
                if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
                        int ts_hdr_len;

                        ts_hdr_len = igb_ptp_rx_pktstamp(rx_ring->q_vector,
                                                         xdp->data,
                                                         &timestamp);

                        xdp->data += ts_hdr_len;
                        xdp->data_meta += ts_hdr_len;
                        size -= ts_hdr_len;
                }

                xdp_res = igb_run_xdp_zc(adapter, rx_ring, xdp, xsk_pool,
                                         xdp_prog);

                if (xdp_res) {
                        if (likely(xdp_res & (IGB_XDP_TX | IGB_XDP_REDIR))) {
                                xdp_xmit |= xdp_res;
                        } else if (xdp_res == IGB_XDP_EXIT) {
                                failure = true;
                                break;
                        } else if (xdp_res == IGB_XDP_CONSUMED) {
                                xsk_buff_free(xdp);
                        }

                        total_packets++;
                        total_bytes += size;
                        ntc++;
                        if (ntc == rx_ring->count)
                                ntc = 0;
                        continue;
                }

                skb = igb_construct_skb_zc(rx_ring, xdp, timestamp);

                /* exit if we failed to retrieve a buffer */
                if (!skb) {
                        rx_ring->rx_stats.alloc_failed++;
                        break;
                }

                xsk_buff_free(xdp);
                ntc++;
                if (ntc == rx_ring->count)
                        ntc = 0;

                if (eth_skb_pad(skb))
                        continue;

                /* probably a little skewed due to removing CRC */
                total_bytes += skb->len;

                /* populate checksum, timestamp, VLAN, and protocol */
                igb_process_skb_fields(rx_ring, rx_desc, skb);

                napi_gro_receive(&q_vector->napi, skb);

                /* update budget accounting */
                total_packets++;
        }

        rx_ring->next_to_clean = ntc;

        if (xdp_xmit)
                igb_finalize_xdp(adapter, xdp_xmit);

        igb_update_rx_stats(q_vector, total_packets, total_bytes);

        entries_to_alloc = igb_desc_unused(rx_ring);
        if (entries_to_alloc >= IGB_RX_BUFFER_WRITE)
                failure |= !igb_alloc_rx_buffers_zc(rx_ring, xsk_pool,
                                                    entries_to_alloc);

        if (xsk_uses_need_wakeup(xsk_pool)) {
                if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
                        xsk_set_rx_need_wakeup(xsk_pool);
                else
                        xsk_clear_rx_need_wakeup(xsk_pool);

                return (int)total_packets;
        }

        return failure ? budget : (int)total_packets;
}

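/* Zero-copy transmit: pull a batch of descriptors from the XSK Tx ring
 * and post them to the hardware ring as IGB_TYPE_XSK buffers. The Tx
 * ring is shared with the regular and XDP transmit paths, so the
 * caller is expected to hold the netdev Tx queue lock.
 */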
bool igb_xmit_zc(struct igb_ring *tx_ring, struct xsk_buff_pool *xsk_pool)
{
        unsigned int budget = igb_desc_unused(tx_ring);
        u32 cmd_type, olinfo_status, nb_pkts, i = 0;
        struct xdp_desc *descs = xsk_pool->tx_descs;
        union e1000_adv_tx_desc *tx_desc = NULL;
        struct igb_tx_buffer *tx_buffer_info;
        unsigned int total_bytes = 0;
        dma_addr_t dma;

        if (!netif_carrier_ok(tx_ring->netdev))
                return true;

        if (test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags))
                return true;

        nb_pkts = xsk_tx_peek_release_desc_batch(xsk_pool, budget);
        if (!nb_pkts)
                return true;

        while (nb_pkts-- > 0) {
                dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr);
                xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, descs[i].len);

                tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
                tx_buffer_info->bytecount = descs[i].len;
                tx_buffer_info->type = IGB_TYPE_XSK;
                tx_buffer_info->xdpf = NULL;
                tx_buffer_info->gso_segs = 1;
                tx_buffer_info->time_stamp = jiffies;

                tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use);
                tx_desc->read.buffer_addr = cpu_to_le64(dma);

                /* put descriptor type bits */
                cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT |
                           E1000_ADVTXD_DCMD_IFCS;
                olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT;

                /* FIXME: This sets the Report Status (RS) bit for every
                 * descriptor. One nice to have optimization would be to set it
                 * only for the last descriptor in the whole batch. See Intel
                 * ice driver for an example on how to do it.
                 */
                cmd_type |= descs[i].len | IGB_TXD_DCMD;
                tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
                tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);

                total_bytes += descs[i].len;

                i++;
                tx_ring->next_to_use++;
                tx_buffer_info->next_to_watch = tx_desc;
                if (tx_ring->next_to_use == tx_ring->count)
                        tx_ring->next_to_use = 0;
        }

        netdev_tx_sent_queue(txring_txq(tx_ring), total_bytes);
        igb_xdp_ring_update_tail(tx_ring);

        return nb_pkts < budget;
}

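/* ndo_xsk_wakeup handler: validate the queue and, unless NAPI is
 * already scheduled, raise a software interrupt so the queue pair gets
 * polled.
 */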
int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
{
        struct igb_adapter *adapter = netdev_priv(dev);
        struct e1000_hw *hw = &adapter->hw;
        struct igb_ring *ring;
        u32 eics = 0;

        if (test_bit(__IGB_DOWN, &adapter->state))
                return -ENETDOWN;

        if (!igb_xdp_is_enabled(adapter))
                return -EINVAL;

        if (qid >= adapter->num_tx_queues)
                return -EINVAL;

        ring = adapter->tx_ring[qid];

        if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags))
                return -ENETDOWN;

        if (!READ_ONCE(ring->xsk_pool))
                return -EINVAL;

        if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) {
                /* Cause software interrupt */
                if (adapter->flags & IGB_FLAG_HAS_MSIX) {
                        eics |= ring->q_vector->eims_value;
                        wr32(E1000_EICS, eics);
                } else {
                        wr32(E1000_ICS, E1000_ICS_RXDMT0);
                }
        }

        return 0;
}