From 75ab1533d79b046152d424ffe947f79f9d15682c Mon Sep 17 00:00:00 2001 From: Anumula Murali Mohan Reddy Date: Tue, 16 Jul 2024 19:55:32 +0530 Subject: [PATCH 01/99] RDMA/cxgb4: use dma_mmap_coherent() for mapping non-contiguous memory dma_alloc_coherent() allocates contiguous memory irrespective of iommu mode, but after commit f5ff79fddf0e ("dma-mapping: remove CONFIG_DMA_REMAP") if iommu is enabled in translate mode, dma_alloc_coherent() may allocate non-contiguous memory. Attempt to map this memory results in panic. This patch fixes the issue by using dma_mmap_coherent() to map each page to user space. Signed-off-by: Anumula Murali Mohan Reddy Signed-off-by: Potnuri Bharat Teja Link: https://lore.kernel.org/r/20240716142532.97423-1-anumula@chelsio.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/cxgb4/cq.c | 8 ++- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 36 +++++++++++++ drivers/infiniband/hw/cxgb4/provider.c | 71 +++++++++++++------------- drivers/infiniband/hw/cxgb4/qp.c | 32 ++++++++++-- 4 files changed, 107 insertions(+), 40 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 5111421f9473..14ced7b667fa 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -1126,13 +1126,19 @@ int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_free_mm2; mm->key = uresp.key; - mm->addr = virt_to_phys(chp->cq.queue); + mm->addr = 0; + mm->vaddr = chp->cq.queue; + mm->dma_addr = chp->cq.dma_addr; mm->len = chp->cq.memsize; + insert_flag_to_mmap(&rhp->rdev, mm, mm->addr); insert_mmap(ucontext, mm); mm2->key = uresp.gts_key; mm2->addr = chp->cq.bar2_pa; mm2->len = PAGE_SIZE; + mm2->vaddr = NULL; + mm2->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, mm2, mm2->addr); insert_mmap(ucontext, mm2); } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index f838bb6718af..bedd5ca96fdd 100644 --- 
a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -532,11 +532,21 @@ static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c) return container_of(c, struct c4iw_ucontext, ibucontext); } +enum { + CXGB4_MMAP_BAR, + CXGB4_MMAP_BAR_WC, + CXGB4_MMAP_CONTIG, + CXGB4_MMAP_NON_CONTIG, +}; + struct c4iw_mm_entry { struct list_head entry; u64 addr; u32 key; + void *vaddr; + dma_addr_t dma_addr; unsigned len; + u8 mmap_flag; }; static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, @@ -561,6 +571,32 @@ static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, return NULL; } +static inline void insert_flag_to_mmap(struct c4iw_rdev *rdev, + struct c4iw_mm_entry *mm, u64 addr) +{ + if (addr >= pci_resource_start(rdev->lldi.pdev, 0) && + (addr < (pci_resource_start(rdev->lldi.pdev, 0) + + pci_resource_len(rdev->lldi.pdev, 0)))) + mm->mmap_flag = CXGB4_MMAP_BAR; + else if (addr >= pci_resource_start(rdev->lldi.pdev, 2) && + (addr < (pci_resource_start(rdev->lldi.pdev, 2) + + pci_resource_len(rdev->lldi.pdev, 2)))) { + if (addr >= rdev->oc_mw_pa) { + mm->mmap_flag = CXGB4_MMAP_BAR_WC; + } else { + if (is_t4(rdev->lldi.adapter_type)) + mm->mmap_flag = CXGB4_MMAP_BAR; + else + mm->mmap_flag = CXGB4_MMAP_BAR_WC; + } + } else { + if (addr) + mm->mmap_flag = CXGB4_MMAP_CONTIG; + else + mm->mmap_flag = CXGB4_MMAP_NON_CONTIG; + } +} + static inline void insert_mmap(struct c4iw_ucontext *ucontext, struct c4iw_mm_entry *mm) { diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 246b739ddb2b..10a4c738b59f 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -113,6 +113,9 @@ static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext, mm->key = uresp.status_page_key; mm->addr = virt_to_phys(rhp->rdev.status_page); mm->len = PAGE_SIZE; + mm->vaddr = NULL; + mm->dma_addr = 0; + 
insert_flag_to_mmap(&rhp->rdev, mm, mm->addr); insert_mmap(context, mm); } return 0; @@ -131,6 +134,11 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) struct c4iw_mm_entry *mm; struct c4iw_ucontext *ucontext; u64 addr; + u8 mmap_flag; + size_t size; + void *vaddr; + unsigned long vm_pgoff; + dma_addr_t dma_addr; pr_debug("pgoff 0x%lx key 0x%x len %d\n", vma->vm_pgoff, key, len); @@ -145,47 +153,38 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) if (!mm) return -EINVAL; addr = mm->addr; + vaddr = mm->vaddr; + dma_addr = mm->dma_addr; + size = mm->len; + mmap_flag = mm->mmap_flag; kfree(mm); - if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) && - (addr < (pci_resource_start(rdev->lldi.pdev, 0) + - pci_resource_len(rdev->lldi.pdev, 0)))) { - - /* - * MA_SYNC register... - */ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + switch (mmap_flag) { + case CXGB4_MMAP_BAR: + ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, + len, + pgprot_noncached(vma->vm_page_prot)); + break; + case CXGB4_MMAP_BAR_WC: + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, t4_pgprot_wc(vma->vm_page_prot)); + break; + case CXGB4_MMAP_CONTIG: ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, len, vma->vm_page_prot); - } else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) && - (addr < (pci_resource_start(rdev->lldi.pdev, 2) + - pci_resource_len(rdev->lldi.pdev, 2)))) { - - /* - * Map user DB or OCQP memory... - */ - if (addr >= rdev->oc_mw_pa) - vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); - else { - if (!is_t4(rdev->lldi.adapter_type)) - vma->vm_page_prot = - t4_pgprot_wc(vma->vm_page_prot); - else - vma->vm_page_prot = - pgprot_noncached(vma->vm_page_prot); - } - ret = io_remap_pfn_range(vma, vma->vm_start, - addr >> PAGE_SHIFT, - len, vma->vm_page_prot); - } else { - - /* - * Map WQ or CQ contig dma memory... 
- */ - ret = remap_pfn_range(vma, vma->vm_start, - addr >> PAGE_SHIFT, - len, vma->vm_page_prot); + break; + case CXGB4_MMAP_NON_CONTIG: + vm_pgoff = vma->vm_pgoff; + vma->vm_pgoff = 0; + ret = dma_mmap_coherent(&rdev->lldi.pdev->dev, vma, + vaddr, dma_addr, size); + vma->vm_pgoff = vm_pgoff; + break; + default: + ret = -EINVAL; + break; } return ret; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index d16d8eaa1415..7b5c4522b426 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -2281,24 +2281,39 @@ int c4iw_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attrs, if (ret) goto err_free_ma_sync_key; sq_key_mm->key = uresp.sq_key; - sq_key_mm->addr = qhp->wq.sq.phys_addr; + sq_key_mm->addr = 0; + sq_key_mm->vaddr = qhp->wq.sq.queue; + sq_key_mm->dma_addr = qhp->wq.sq.dma_addr; sq_key_mm->len = PAGE_ALIGN(qhp->wq.sq.memsize); + insert_flag_to_mmap(&rhp->rdev, sq_key_mm, sq_key_mm->addr); insert_mmap(ucontext, sq_key_mm); if (!attrs->srq) { rq_key_mm->key = uresp.rq_key; - rq_key_mm->addr = virt_to_phys(qhp->wq.rq.queue); + rq_key_mm->addr = 0; + rq_key_mm->vaddr = qhp->wq.rq.queue; + rq_key_mm->dma_addr = qhp->wq.rq.dma_addr; rq_key_mm->len = PAGE_ALIGN(qhp->wq.rq.memsize); + insert_flag_to_mmap(&rhp->rdev, rq_key_mm, + rq_key_mm->addr); insert_mmap(ucontext, rq_key_mm); } sq_db_key_mm->key = uresp.sq_db_gts_key; sq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.sq.bar2_pa; + sq_db_key_mm->vaddr = NULL; + sq_db_key_mm->dma_addr = 0; sq_db_key_mm->len = PAGE_SIZE; + insert_flag_to_mmap(&rhp->rdev, sq_db_key_mm, + sq_db_key_mm->addr); insert_mmap(ucontext, sq_db_key_mm); if (!attrs->srq) { rq_db_key_mm->key = uresp.rq_db_gts_key; rq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.rq.bar2_pa; rq_db_key_mm->len = PAGE_SIZE; + rq_db_key_mm->vaddr = NULL; + rq_db_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, rq_db_key_mm, + rq_db_key_mm->addr); insert_mmap(ucontext, rq_db_key_mm); } if 
(ma_sync_key_mm) { @@ -2307,6 +2322,10 @@ int c4iw_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attrs, (pci_resource_start(rhp->rdev.lldi.pdev, 0) + PCIE_MA_SYNC_A) & PAGE_MASK; ma_sync_key_mm->len = PAGE_SIZE; + ma_sync_key_mm->vaddr = NULL; + ma_sync_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, ma_sync_key_mm, + ma_sync_key_mm->addr); insert_mmap(ucontext, ma_sync_key_mm); } @@ -2761,12 +2780,19 @@ int c4iw_create_srq(struct ib_srq *ib_srq, struct ib_srq_init_attr *attrs, if (ret) goto err_free_srq_db_key_mm; srq_key_mm->key = uresp.srq_key; - srq_key_mm->addr = virt_to_phys(srq->wq.queue); + srq_key_mm->addr = 0; srq_key_mm->len = PAGE_ALIGN(srq->wq.memsize); + srq_key_mm->vaddr = srq->wq.queue; + srq_key_mm->dma_addr = srq->wq.dma_addr; + insert_flag_to_mmap(&rhp->rdev, srq_key_mm, srq_key_mm->addr); insert_mmap(ucontext, srq_key_mm); srq_db_key_mm->key = uresp.srq_db_gts_key; srq_db_key_mm->addr = (u64)(unsigned long)srq->wq.bar2_pa; srq_db_key_mm->len = PAGE_SIZE; + srq_db_key_mm->vaddr = NULL; + srq_db_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, srq_db_key_mm, + srq_db_key_mm->addr); insert_mmap(ucontext, srq_db_key_mm); } From 60dc7fcafea817f3dcff7ece18095ca6260b73bc Mon Sep 17 00:00:00 2001 From: Showrya M N Date: Wed, 24 Jul 2024 14:24:28 +0530 Subject: [PATCH 02/99] RDMA/siw: Remove NETDEV_GOING_DOWN event handler Toggling link while running NVME-oF over siw hits a kernel panic due to race condition within siw_handler and ib_destroy_qp(). The IB_EVENT_PORT_ERR event can alone handle destroying qps. therefore remove unwanted processing in siw. 
Suggested-by: Bernard Metzler Signed-off-by: Showrya M N Signed-off-by: Potnuri Bharat Teja Link: https://lore.kernel.org/r/20240724085428.3813-1-showrya@chelsio.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/siw/siw.h | 2 -- drivers/infiniband/sw/siw/siw_main.c | 37 ---------------------------- 2 files changed, 39 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 75253f2b3e3d..86d4d6a2170e 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -94,8 +94,6 @@ struct siw_device { atomic_t num_mr; atomic_t num_srq; atomic_t num_ctx; - - struct work_struct netdev_down; }; struct siw_ucontext { diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index b2b54242aa69..17abef48abcd 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -364,39 +364,6 @@ error: return NULL; } -/* - * Network link becomes unavailable. Mark all - * affected QP's accordingly. 
- */ -static void siw_netdev_down(struct work_struct *work) -{ - struct siw_device *sdev = - container_of(work, struct siw_device, netdev_down); - - struct siw_qp_attrs qp_attrs; - struct list_head *pos, *tmp; - - memset(&qp_attrs, 0, sizeof(qp_attrs)); - qp_attrs.state = SIW_QP_STATE_ERROR; - - list_for_each_safe(pos, tmp, &sdev->qp_list) { - struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); - - down_write(&qp->state_lock); - WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); - up_write(&qp->state_lock); - } - ib_device_put(&sdev->base_dev); -} - -static void siw_device_goes_down(struct siw_device *sdev) -{ - if (ib_device_try_get(&sdev->base_dev)) { - INIT_WORK(&sdev->netdev_down, siw_netdev_down); - schedule_work(&sdev->netdev_down); - } -} - static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { @@ -418,10 +385,6 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; - case NETDEV_GOING_DOWN: - siw_device_goes_down(sdev); - break; - case NETDEV_DOWN: sdev->state = IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); From 2a777679b8ccd09a9a65ea0716ef10365179caac Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Mon, 22 Jul 2024 16:33:25 +0530 Subject: [PATCH 03/99] RDMA/mad: Improve handling of timed out WRs of mad agent Current timeout handler of mad agent acquires/releases mad_agent_priv lock for every timed out WRs. This causes heavy locking contention when higher no. of WRs are to be handled inside timeout handler. This leads to softlockup with below trace in some use cases where rdma-cm path is used to establish connection between peer nodes Trace: ----- BUG: soft lockup - CPU#4 stuck for 26s! [kworker/u128:3:19767] CPU: 4 PID: 19767 Comm: kworker/u128:3 Kdump: loaded Tainted: G OE ------- --- 5.14.0-427.13.1.el9_4.x86_64 #1 Hardware name: Dell Inc. 
PowerEdge R740/01YM03, BIOS 2.4.8 11/26/2019 Workqueue: ib_mad1 timeout_sends [ib_core] RIP: 0010:__do_softirq+0x78/0x2ac RSP: 0018:ffffb253449e4f98 EFLAGS: 00000246 RAX: 00000000ffffffff RBX: 0000000000000000 RCX: 000000000000001f RDX: 000000000000001d RSI: 000000003d1879ab RDI: fff363b66fd3a86b RBP: ffffb253604cbcd8 R08: 0000009065635f3b R09: 0000000000000000 R10: 0000000000000040 R11: ffffb253449e4ff8 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000040 FS: 0000000000000000(0000) GS:ffff8caa1fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fd9ec9db900 CR3: 0000000891934006 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? show_trace_log_lvl+0x1c4/0x2df ? show_trace_log_lvl+0x1c4/0x2df ? __irq_exit_rcu+0xa1/0xc0 ? watchdog_timer_fn+0x1b2/0x210 ? __pfx_watchdog_timer_fn+0x10/0x10 ? __hrtimer_run_queues+0x127/0x2c0 ? hrtimer_interrupt+0xfc/0x210 ? __sysvec_apic_timer_interrupt+0x5c/0x110 ? sysvec_apic_timer_interrupt+0x37/0x90 ? asm_sysvec_apic_timer_interrupt+0x16/0x20 ? __do_softirq+0x78/0x2ac ? __do_softirq+0x60/0x2ac __irq_exit_rcu+0xa1/0xc0 sysvec_call_function_single+0x72/0x90 asm_sysvec_call_function_single+0x16/0x20 RIP: 0010:_raw_spin_unlock_irq+0x14/0x30 RSP: 0018:ffffb253604cbd88 EFLAGS: 00000247 RAX: 000000000001960d RBX: 0000000000000002 RCX: ffff8cad2a064800 RDX: 000000008020001b RSI: 0000000000000001 RDI: ffff8cad5d39f66c RBP: ffff8cad5d39f600 R08: 0000000000000001 R09: 0000000000000000 R10: ffff8caa443e0c00 R11: ffffb253604cbcd8 R12: ffff8cacb8682538 R13: 0000000000000005 R14: ffffb253604cbd90 R15: ffff8cad5d39f66c cm_process_send_error+0x122/0x1d0 [ib_cm] timeout_sends+0x1dd/0x270 [ib_core] process_one_work+0x1e2/0x3b0 ? __pfx_worker_thread+0x10/0x10 worker_thread+0x50/0x3a0 ? __pfx_worker_thread+0x10/0x10 kthread+0xdd/0x100 ? 
__pfx_kthread+0x10/0x10 ret_from_fork+0x29/0x50 Simplified timeout handler by creating local list of timed out WRs and invoke send handler post creating the list. The new method acquires/releases lock once to fetch the list and hence helps to reduce locking contention when processing higher no. of WRs Signed-off-by: Saravanan Vajravel Link: https://lore.kernel.org/r/20240722110325.195085-1-saravanan.vajravel@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/mad.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 7439e47ff951..70708fea1296 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2616,14 +2616,16 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) static void timeout_sends(struct work_struct *work) { + struct ib_mad_send_wr_private *mad_send_wr, *n; struct ib_mad_agent_private *mad_agent_priv; - struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; + struct list_head local_list; unsigned long flags, delay; mad_agent_priv = container_of(work, struct ib_mad_agent_private, timed_work.work); mad_send_wc.vendor_err = 0; + INIT_LIST_HEAD(&local_list); spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->wait_list)) { @@ -2641,13 +2643,16 @@ static void timeout_sends(struct work_struct *work) break; } - list_del(&mad_send_wr->agent_list); + list_del_init(&mad_send_wr->agent_list); if (mad_send_wr->status == IB_WC_SUCCESS && !retry_send(mad_send_wr)) continue; - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + list_add_tail(&mad_send_wr->agent_list, &local_list); + } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + list_for_each_entry_safe(mad_send_wr, n, &local_list, agent_list) { if (mad_send_wr->status == IB_WC_SUCCESS) mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; else @@ -2655,11 +2660,8 @@ static void
timeout_sends(struct work_struct *work) mad_send_wc.send_buf = &mad_send_wr->send_buf; mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); - deref_mad_agent(mad_agent_priv); - spin_lock_irqsave(&mad_agent_priv->lock, flags); } - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } /* From c772a2c690182410642ead740f7a84b3a7544b2b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:10 +0300 Subject: [PATCH 04/99] net/mlx5: Add IFC related stuff for data direct Add IFC related stuff for data direct. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/82da7f578a567909bb5858a64ba844fe4cc298fa.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 51 +++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cab228cf51c6..970c9d8473ef 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -313,6 +313,7 @@ enum { MLX5_CMD_OP_MODIFY_VHCA_STATE = 0xb0e, MLX5_CMD_OP_SYNC_CRYPTO = 0xb12, MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS = 0xb16, + MLX5_CMD_OPCODE_QUERY_VUID = 0xb22, MLX5_CMD_OP_MAX }; @@ -1885,7 +1886,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_5a0[0x10]; u8 enhanced_cqe_compression[0x1]; - u8 reserved_at_5b1[0x2]; + u8 reserved_at_5b1[0x1]; + u8 crossing_vhca_mkey[0x1]; u8 log_max_dek[0x5]; u8 reserved_at_5b8[0x4]; u8 mini_cqe_resp_stride_index[0x1]; @@ -1954,7 +1956,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 dynamic_msix_table_size[0xc]; u8 reserved_at_740[0xc]; u8 min_dynamic_vf_msix_table_size[0x4]; - u8 reserved_at_750[0x4]; + u8 reserved_at_750[0x2]; + u8 data_direct[0x1]; + u8 reserved_at_753[0x1]; u8 max_dynamic_vf_msix_table_size[0xc]; u8 reserved_at_760[0x3]; @@ -1982,7 +1986,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_0[0x80]; u8 migratable[0x1]; - u8 reserved_at_81[0x1f]; + u8 reserved_at_81[0x11]; + u8 
query_vuid[0x1]; + u8 reserved_at_93[0xd]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; @@ -4154,6 +4160,7 @@ enum { MLX5_MKC_ACCESS_MODE_KSM = 0x3, MLX5_MKC_ACCESS_MODE_SW_ICM = 0x4, MLX5_MKC_ACCESS_MODE_MEMIC = 0x5, + MLX5_MKC_ACCESS_MODE_CROSSING = 0x6, }; struct mlx5_ifc_mkc_bits { @@ -4196,7 +4203,10 @@ struct mlx5_ifc_mkc_bits { u8 bsf_octword_size[0x20]; - u8 reserved_at_120[0x80]; + u8 reserved_at_120[0x60]; + + u8 crossing_target_vhca_id[0x10]; + u8 reserved_at_190[0x10]; u8 translations_octword_size[0x20]; @@ -5124,6 +5134,36 @@ struct mlx5_ifc_query_vport_state_out_bits { u8 state[0x4]; }; +struct mlx5_ifc_array1024_auto_bits { + u8 array1024_auto[32][0x20]; +}; + +struct mlx5_ifc_query_vuid_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x40]; + + u8 query_vfs_vuid[0x1]; + u8 data_direct[0x1]; + u8 reserved_at_62[0xe]; + u8 vhca_id[0x10]; +}; + +struct mlx5_ifc_query_vuid_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x1a0]; + + u8 reserved_at_1e0[0x10]; + u8 num_of_entries[0x10]; + + struct mlx5_ifc_array1024_auto_bits vuid[]; +}; + enum { MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT = 0x0, MLX5_VPORT_STATE_OP_MOD_ESW_VPORT = 0x1, @@ -8989,7 +9029,8 @@ struct mlx5_ifc_create_mkey_in_bits { u8 pg_access[0x1]; u8 mkey_umem_valid[0x1]; - u8 reserved_at_62[0x1e]; + u8 data_direct[0x1]; + u8 reserved_at_63[0x1d]; struct mlx5_ifc_mkc_bits memory_key_mkey_entry; From df6d27a30970158466b632c82da09a9b24c30f4b Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Tue, 30 Jul 2024 12:17:25 +0300 Subject: [PATCH 05/99] RDMA/nldev: Enhance netlink message parsing and validation Use strict parsing validation for set commands, and liberal validation for get commands. Additionally, remove all usage of nlmsg_parse_deprecated(). Strict parsing validation fails when encountering unrecognized attributes in the Netlink message, while liberal parsing validation ignores them.
In 57d7a8fd904c ("rdma: Add an option to display driver-specific QPs in the rdma tool") in iproute2, the attribute RDMA_NLDEV_ATTR_DRIVER_DETAILS was added. This causes backwards compatibility issues when using the rdma tool with the new attribute and an older kernel which does not recognize this attribute. In this case, the command "rdma stat show mr" would fail, because the new rdma tool would fill the netlink message with the new attribute and the older kernel would fail as it used strict parsing and did not recognize the new attribute. In general, strict validation is appropriate for set commands as they modify the system, while liberal validation is suitable for get commands which only query system information. Replace all uses of nlmsg_parse_deprecated() with __nlmsg_parse(), using the NL_VALIDATE_LIBERAL flag. The nlmsg_parse_deprecated() function internally calls __nlmsg_parse() with the NL_VALIDATE_LIBERAL flag, but its name is confusing. Signed-off-by: Chiara Meiohas Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/f633a979a49db090d05c24a3ba83d30727bb777b.1722331020.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 56 ++++++++++++++++----------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index a6b80cdc96f7..4d4a1f90e484 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1074,8 +1074,8 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1123,8 +1123,8 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb,
RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1215,8 +1215,8 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 port; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) @@ -1275,8 +1275,8 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, int err; unsigned int p; - err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, NULL); + err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1331,8 +1331,8 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int ret; - ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1481,8 +1481,8 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct sk_buff *msg; int ret; - ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id]) return -EINVAL; @@ -1569,8 +1569,8 @@ static int res_get_common_dumpit(struct sk_buff *skb, u32 index, port = 0; bool filled = false; - err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, NULL); + err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 
1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); /* * Right now, we are expecting the device index to get res information, * but it is possible to extend this code to return all devices in @@ -1762,8 +1762,8 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, char type[IFNAMSIZ]; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) return -EINVAL; @@ -1806,8 +1806,8 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1836,8 +1836,8 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, - extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) return -EINVAL; @@ -1920,8 +1920,8 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct sk_buff *msg; int err; - err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err) return err; @@ -2420,8 +2420,8 @@ static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; - ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret) return 
-EINVAL; @@ -2450,8 +2450,8 @@ static int nldev_stat_get_dumpit(struct sk_buff *skb, struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; - ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, NULL); + ret = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) return -EINVAL; @@ -2482,8 +2482,8 @@ static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, u32 devid, port; int ret, i; - ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; From 0ea4ffb2bc80e8f4e6ca87e07142d1c17285ae95 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 30 Jul 2024 12:18:45 +0300 Subject: [PATCH 06/99] RDMA/mlx5: Expose vhca id for all ports in multiport mode In multiport mode, RDMA devices make it impossible for userspace to use DEVX to discover vhca id values for ports beyond port 1. This patch addresses the issue by exposing the vhca id of all ports. 
Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Link: https://patch.msgid.link/41dea83aa51843aa4c067b4f73f28d64e51bd53c.1722331101.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/std_types.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index bbfcce3bdc84..5c83765a85e0 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -111,6 +111,23 @@ out: return err; } +static int fill_multiport_info(struct mlx5_ib_dev *dev, u32 port_num, + struct mlx5_ib_uapi_query_port *info) +{ + struct mlx5_core_dev *mdev; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, NULL); + if (!mdev) + return -EINVAL; + + info->vport_vhca_id = MLX5_CAP_GEN(mdev, vhca_id); + info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID; + + mlx5_ib_put_native_port_mdev(dev, port_num); + + return 0; +} + static int fill_switchdev_info(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_uapi_query_port *info) { @@ -177,6 +194,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_QUERY_PORT)( ret = fill_switchdev_info(dev, port_num, &info); if (ret) return ret; + } else if (mlx5_core_mp_enabled(dev->mdev)) { + ret = fill_multiport_info(dev, port_num, &info); + if (ret) + return ret; } return uverbs_copy_to_struct_or_zero(attrs, MLX5_IB_ATTR_QUERY_PORT, &info, From 6910e3660d86c1a5654f742a40181d2c9154f26f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:11 +0300 Subject: [PATCH 07/99] RDMA/mlx5: Introduce the 'data direct' driver Introduce the 'data direct' driver for a ConnectX-8 Data Direct device. The 'data direct' driver functions as the affiliated DMA device for one or more capable mlx5_ib devices. This DMA device, as the name suggests, is used exclusively for DMA operations. 
It can be considered a DMA engine managed by a PF/VF, lacking network capabilities and having minimal overall capabilities. Consequently, the DMA NIC PF will not be exposed to or directly used by software applications. The driver will not have any direct interface or interaction with the firmware (no command interface, no capabilities, etc.). It will operate solely over PCI to enable its DMA functionality. Registration and un-registration of the driver are handled as part of the mlx5_ib initialization and exit processes, as the mlx5_ib devices will effectively be its clients. The driver will serve as the DMA device for accessing another PCI device to achieve optimal performance (both on the same NUMA node, P2P access, etc.). Upon probing, it will read its VUID over PCI to handle mlx5_ib device registrations with the same VUID. Upon removal, it will notify its clients to allow them to clean up the resources that were mmaped with its DMA device. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/b77edecfd476c3f445da96ab6aef499ae47b2829.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/Makefile | 1 + drivers/infiniband/hw/mlx5/data_direct.c | 227 +++++++++++++++++++++++ drivers/infiniband/hw/mlx5/data_direct.h | 23 +++ drivers/infiniband/hw/mlx5/main.c | 24 +++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 + 5 files changed, 281 insertions(+) create mode 100644 drivers/infiniband/hw/mlx5/data_direct.c create mode 100644 drivers/infiniband/hw/mlx5/data_direct.h diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 72a526236c2e..b38961f5058e 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -6,6 +6,7 @@ mlx5_ib-y := ah.o \ cong.o \ counters.o \ cq.o \ + data_direct.o \ dm.o \ doorbell.o \ gsi.o \ diff --git a/drivers/infiniband/hw/mlx5/data_direct.c b/drivers/infiniband/hw/mlx5/data_direct.c new file mode 100644 index 
000000000000..b9ba84afaae2 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include "mlx5_ib.h" +#include "data_direct.h" + +static LIST_HEAD(mlx5_data_direct_dev_list); +static LIST_HEAD(mlx5_data_direct_reg_list); + +/* + * This mutex should be held when accessing either of the above lists + */ +static DEFINE_MUTEX(mlx5_data_direct_mutex); + +struct mlx5_data_direct_registration { + struct mlx5_ib_dev *ibdev; + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1]; + struct list_head list; +}; + +static const struct pci_device_id mlx5_data_direct_pci_table[] = { + { PCI_VDEVICE(MELLANOX, 0x2100) }, /* ConnectX-8 Data Direct */ + { 0, } +}; + +static int mlx5_data_direct_vpd_get_vuid(struct mlx5_data_direct_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + unsigned int vpd_size, kw_len; + u8 *vpd_data; + int start; + int ret; + + vpd_data = pci_vpd_alloc(pdev, &vpd_size); + if (IS_ERR(vpd_data)) { + pci_err(pdev, "Unable to read VPD, err=%ld\n", PTR_ERR(vpd_data)); + return PTR_ERR(vpd_data); + } + + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "VU", &kw_len); + if (start < 0) { + ret = start; + pci_err(pdev, "VU keyword not found, err=%d\n", ret); + goto end; + } + + dev->vuid = kmemdup_nul(vpd_data + start, kw_len, GFP_KERNEL); + ret = dev->vuid ? 
0 : -ENOMEM; + +end: + kfree(vpd_data); + return ret; +} + +static void mlx5_data_direct_shutdown(struct pci_dev *pdev) +{ + pci_disable_device(pdev); +} + +static int mlx5_data_direct_set_dma_caps(struct pci_dev *pdev) +{ + int err; + + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, + "Warning: couldn't set 64-bit PCI DMA mask, err=%d\n", err); + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, err=%d\n", err); + return err; + } + } + + dma_set_max_seg_size(&pdev->dev, SZ_2G); + return 0; +} + +int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid) +{ + struct mlx5_data_direct_registration *reg; + struct mlx5_data_direct_dev *dev; + + reg = kzalloc(sizeof(*reg), GFP_KERNEL); + if (!reg) + return -ENOMEM; + + reg->ibdev = ibdev; + strcpy(reg->vuid, vuid); + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(dev, &mlx5_data_direct_dev_list, list) { + if (strcmp(dev->vuid, vuid) == 0) { + mlx5_ib_data_direct_bind(ibdev, dev); + break; + } + } + + /* Add the registration to its global list, to be used upon bind/unbind + * of its affiliated data direct device + */ + list_add_tail(&reg->list, &mlx5_data_direct_reg_list); + mutex_unlock(&mlx5_data_direct_mutex); + return 0; +} + +void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (reg->ibdev == ibdev) { + list_del(&reg->list); + kfree(reg); + goto end; + } + } + + WARN_ON(true); +end: + mutex_unlock(&mlx5_data_direct_mutex); +} + +static void mlx5_data_direct_dev_reg(struct mlx5_data_direct_dev *dev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (strcmp(dev->vuid, reg->vuid) == 0) +
mlx5_ib_data_direct_bind(reg->ibdev, dev); + } + + /* Add the data direct device to the global list, further IB devices may + * use it later as well + */ + list_add_tail(&dev->list, &mlx5_data_direct_dev_list); + mutex_unlock(&mlx5_data_direct_mutex); +} + +static void mlx5_data_direct_dev_unreg(struct mlx5_data_direct_dev *dev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + /* Prevent any further affiliations */ + list_del(&dev->list); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (strcmp(dev->vuid, reg->vuid) == 0) + mlx5_ib_data_direct_unbind(reg->ibdev); + } + mutex_unlock(&mlx5_data_direct_mutex); +} + +static int mlx5_data_direct_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct mlx5_data_direct_dev *dev; + int err; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->device = &pdev->dev; + dev->pdev = pdev; + + pci_set_drvdata(dev->pdev, dev); + err = pci_enable_device(pdev); + if (err) { + dev_err(dev->device, "Cannot enable PCI device, err=%d\n", err); + goto err; + } + + pci_set_master(pdev); + err = mlx5_data_direct_set_dma_caps(pdev); + if (err) + goto err_disable; + + if (pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128)) + dev_dbg(dev->device, "Enabling pci atomics failed\n"); + + err = mlx5_data_direct_vpd_get_vuid(dev); + if (err) + goto err_disable; + + mlx5_data_direct_dev_reg(dev); + return 0; + +err_disable: + pci_disable_device(pdev); +err: + kfree(dev); + return err; +} + +static void mlx5_data_direct_remove(struct pci_dev *pdev) +{ + struct mlx5_data_direct_dev *dev = pci_get_drvdata(pdev); + + mlx5_data_direct_dev_unreg(dev); + pci_disable_device(pdev); + kfree(dev->vuid); + kfree(dev); +} + +static struct pci_driver mlx5_data_direct_driver = { + .name = 
KBUILD_MODNAME, + .id_table = mlx5_data_direct_pci_table, + .probe = mlx5_data_direct_probe, + .remove = mlx5_data_direct_remove, + .shutdown = mlx5_data_direct_shutdown, +}; + +int mlx5_data_direct_driver_register(void) +{ + return pci_register_driver(&mlx5_data_direct_driver); +} + +void mlx5_data_direct_driver_unregister(void) +{ + pci_unregister_driver(&mlx5_data_direct_driver); +} diff --git a/drivers/infiniband/hw/mlx5/data_direct.h b/drivers/infiniband/hw/mlx5/data_direct.h new file mode 100644 index 000000000000..2fd2bdbe8f69 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _MLX5_IB_DATA_DIRECT_H +#define _MLX5_IB_DATA_DIRECT_H + +struct mlx5_ib_dev; + +struct mlx5_data_direct_dev { + struct device *device; + struct pci_dev *pdev; + char *vuid; + struct list_head list; +}; + +int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid); +void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev); +int mlx5_data_direct_driver_register(void); +void mlx5_data_direct_driver_unregister(void); + +#endif diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 6048b9ad13bb..de254cf03173 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -48,6 +48,7 @@ #include #include #include "macsec.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include @@ -3866,6 +3867,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_max(mdev); mutex_init(&dev->cap_mask_mutex); + mutex_init(&dev->data_direct_lock); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); xa_init(&dev->odp_mkeys); @@ -4293,6 +4295,21 @@ static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev) mlx5_notifier_unregister(dev->mdev, 
&dev->mdev_events); } +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev) +{ + mutex_lock(&ibdev->data_direct_lock); + ibdev->data_direct_dev = dev; + mutex_unlock(&ibdev->data_direct_lock); +} + +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev) +{ + mutex_lock(&ibdev->data_direct_lock); + ibdev->data_direct_dev = NULL; + mutex_unlock(&ibdev->data_direct_lock); +} + void __mlx5_ib_remove(struct mlx5_ib_dev *dev, const struct mlx5_ib_profile *profile, int stage) @@ -4715,17 +4732,23 @@ static int __init mlx5_ib_init(void) ret = mlx5r_rep_init(); if (ret) goto rep_err; + ret = mlx5_data_direct_driver_register(); + if (ret) + goto dd_err; ret = auxiliary_driver_register(&mlx5r_mp_driver); if (ret) goto mp_err; ret = auxiliary_driver_register(&mlx5r_driver); if (ret) goto drv_err; + return 0; drv_err: auxiliary_driver_unregister(&mlx5r_mp_driver); mp_err: + mlx5_data_direct_driver_unregister(); +dd_err: mlx5r_rep_cleanup(); rep_err: mlx5_ib_qp_event_cleanup(); @@ -4737,6 +4760,7 @@ qp_event_err: static void __exit mlx5_ib_cleanup(void) { + mlx5_data_direct_driver_unregister(); auxiliary_driver_unregister(&mlx5r_driver); auxiliary_driver_unregister(&mlx5r_mp_driver); mlx5r_rep_cleanup(); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index d5eb1b726675..b0d7d8b9e672 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1131,6 +1131,9 @@ struct mlx5_macsec { struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; + struct mlx5_data_direct_dev *data_direct_dev; + /* protect accessing data_direct_dev */ + struct mutex data_direct_lock; struct notifier_block mdev_events; int num_ports; /* serialize update of capability mask @@ -1425,6 +1428,9 @@ int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct 
uverbs_attr_bundle *attrs); +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev); +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); From 2e8e631d7a41e3a4edc94f3c9dd5cb32c2aa539e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:12 +0300 Subject: [PATCH 08/99] RDMA/mlx5: Add the initialization flow to utilize the 'data direct' device Add the NET device initialization flow to utilize the 'data direct' device. When a NET mlx5_ib device is capable of 'data direct', the following sequence of actions will occur: - Find its affiliated 'data direct' VUID via a firmware command. - Create its own private PD and 'data direct' mkey. - Register to be notified when its 'data direct' driver is probed or removed. The DMA device of the affiliated 'data direct' device, including the private PD and the 'data direct' mkey, will be used later during MR registrations that request the data direct functionality. 
Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/b11fa87b2a65bce4db8d40341bb6cee490fa4d06.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/cmd.c | 21 +++++++ drivers/infiniband/hw/mlx5/cmd.h | 2 + drivers/infiniband/hw/mlx5/main.c | 90 ++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 ++ 4 files changed, 119 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 895b62cc528d..7c08e3008927 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -245,3 +245,24 @@ int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid) MLX5_SET(dealloc_uar_in, in, uid, uid); return mlx5_cmd_exec_in(dev, dealloc_uar, in); } + +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid) +{ + u8 out[MLX5_ST_SZ_BYTES(query_vuid_out) + + MLX5_ST_SZ_BYTES(array1024_auto)] = {}; + u8 in[MLX5_ST_SZ_BYTES(query_vuid_in)] = {}; + char *vuid; + int err; + + MLX5_SET(query_vuid_in, in, opcode, MLX5_CMD_OPCODE_QUERY_VUID); + MLX5_SET(query_vuid_in, in, vhca_id, MLX5_CAP_GEN(dev, vhca_id)); + MLX5_SET(query_vuid_in, in, data_direct, data_direct); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + vuid = MLX5_ADDR_OF(query_vuid_out, out, vuid); + memcpy(out_vuid, vuid, MLX5_ST_SZ_BYTES(array1024_auto)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index e5cd31270443..e6c88b6ebd0d 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -58,4 +58,6 @@ int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb, u16 opmod, u8 port); int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid); int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid); +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid); #endif /* 
MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index de254cf03173..fc0562f07249 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3025,6 +3025,59 @@ static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) mutex_destroy(&devr->srq_lock); } +static int +mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_core_dev *mdev = dev->mdev; + void *mkc; + u32 mkey; + u32 pdn; + u32 *in; + int err; + + err = mlx5_core_alloc_pd(mdev, &pdn); + if (err) + return err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err; + } + + MLX5_SET(create_mkey_in, in, data_direct, 1); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); + kvfree(in); + if (err) + goto err; + + dev->ddr.mkey = mkey; + dev->ddr.pdn = pdn; + return 0; + +err: + mlx5_core_dealloc_pd(mdev, pdn); + return err; +} + +static void +mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev) +{ + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey); + mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn); +} + static u32 get_core_cap_flags(struct ib_device *ibdev, struct mlx5_hca_vport_context *rep) { @@ -3421,6 +3474,38 @@ unbind: return false; } +static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev) +{ + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {}; + int ret; + + if (!MLX5_CAP_GEN(dev->mdev, data_direct)) + return 0; + + ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid); + if (ret) + return ret; + + ret = 
mlx5_ib_create_data_direct_resources(dev); + if (ret) + return ret; + + ret = mlx5_data_direct_ib_reg(dev, vuid); + if (ret) + mlx5_ib_free_data_direct_resources(dev); + + return ret; +} + +static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev) +{ + if (!MLX5_CAP_GEN(dev->mdev, data_direct)) + return; + + mlx5_data_direct_ib_unreg(dev); + mlx5_ib_free_data_direct_resources(dev); +} + static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) { u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1; @@ -3814,6 +3899,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { + mlx5_ib_data_direct_cleanup(dev); mlx5_ib_cleanup_multiport_master(dev); WARN_ON(!xa_empty(&dev->odp_mkeys)); mutex_destroy(&dev->cap_mask_mutex); @@ -3876,6 +3962,10 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; + err = mlx5_ib_data_direct_init(dev); + if (err) + goto err_mp; + return 0; err_mp: mlx5_ib_cleanup_multiport_master(dev); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b0d7d8b9e672..b2ebea173547 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -835,6 +835,11 @@ struct mlx5_ib_port_resources { struct work_struct pkey_change_work; }; +struct mlx5_data_direct_resources { + u32 pdn; + u32 mkey; +}; + struct mlx5_ib_resources { struct ib_cq *c0; struct mutex cq_lock; @@ -1188,6 +1193,7 @@ struct mlx5_ib_dev { u16 pkey_table_len; u8 lag_ports; struct mlx5_special_mkeys mkeys; + struct mlx5_data_direct_resources ddr; #ifdef CONFIG_MLX5_MACSEC struct mlx5_macsec macsec; From 682358fd35dece838e6ae2d9d6a69fc0b9a9d411 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:13 +0300 Subject: [PATCH 09/99] RDMA/umem: Add support for creating pinned DMABUF umem with a given dma device Add support for creating pinned DMABUF umem with a 
specified DMA device instead of the DMA device of the given IB device. This API will be utilized in the upcoming patches of the series when multiple path DMAs are implemented. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/038aad36a43797e5591b20ba81051fc5758124f9.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_dmabuf.c | 45 ++++++++++++++++++++------- include/rdma/ib_umem.h | 15 +++++++++ 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 39357dc2d229..726a09786547 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -110,10 +110,12 @@ void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) } EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages); -struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, - unsigned long offset, size_t size, - int fd, int access, - const struct dma_buf_attach_ops *ops) +static struct ib_umem_dmabuf * +ib_umem_dmabuf_get_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) { struct dma_buf *dmabuf; struct ib_umem_dmabuf *umem_dmabuf; @@ -152,7 +154,7 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, umem_dmabuf->attach = dma_buf_dynamic_attach( dmabuf, - device->dma_device, + dma_device, ops, umem_dmabuf); if (IS_ERR(umem_dmabuf->attach)) { @@ -168,6 +170,15 @@ out_release_dmabuf: dma_buf_put(dmabuf); return ret; } + +struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) +{ + return ib_umem_dmabuf_get_with_dma_device(device, device->dma_device, + offset, size, fd, access, ops); +} EXPORT_SYMBOL(ib_umem_dmabuf_get); static void @@ -184,16 +195,18 @@ static struct dma_buf_attach_ops 
ib_umem_dmabuf_attach_pinned_ops = { .move_notify = ib_umem_dmabuf_unsupported_move_notify, }; -struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, - unsigned long offset, - size_t size, int fd, - int access) +struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access) { struct ib_umem_dmabuf *umem_dmabuf; int err; - umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access, - &ib_umem_dmabuf_attach_pinned_ops); + umem_dmabuf = ib_umem_dmabuf_get_with_dma_device(device, dma_device, offset, + size, fd, access, + &ib_umem_dmabuf_attach_pinned_ops); if (IS_ERR(umem_dmabuf)) return umem_dmabuf; @@ -217,6 +230,16 @@ err_release: ib_umem_release(&umem_dmabuf->umem); return ERR_PTR(err); } +EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned_with_dma_device); + +struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, + unsigned long offset, + size_t size, int fd, + int access) +{ + return ib_umem_dmabuf_get_pinned_with_dma_device(device, device->dma_device, + offset, size, fd, access); +} EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned); void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 565a85044541..de05268ed632 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -150,6 +150,11 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, unsigned long offset, size_t size, int fd, int access); +struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access); int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf); @@ -196,6 +201,16 @@ 
ib_umem_dmabuf_get_pinned(struct ib_device *device, unsigned long offset, { return ERR_PTR(-EOPNOTSUPP); } + +static inline struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) { return -EOPNOTSUPP; From 253c61dc256b3e6be65657f78b4a8452163ce00f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:14 +0300 Subject: [PATCH 10/99] RDMA/umem: Introduce an option to revoke DMABUF umem Introduce an option to revoke DMABUF umem. This option will retain the umem allocation while revoking its DMA mapping. Furthermore, any subsequent attempts to map the pages should fail once the umem has been revoked. This functionality will be utilized in the upcoming patches in the series, where we aim to delay umem deallocation until the mkey deregistration. However, we must unmap its pages immediately. 
Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/a38270f2fe4a194868ca2312f4c1c760e51bcbff.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_dmabuf.c | 21 +++++++++++++++++++-- include/rdma/ib_umem.h | 3 +++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 726a09786547..9fcd37761264 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -23,6 +23,9 @@ int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + if (umem_dmabuf->revoked) + return -EINVAL; + if (umem_dmabuf->sgt) goto wait_fence; @@ -242,15 +245,29 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, } EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned); -void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) +void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) { struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; dma_resv_lock(dmabuf->resv, NULL); + if (umem_dmabuf->revoked) + goto end; ib_umem_dmabuf_unmap_pages(umem_dmabuf); - if (umem_dmabuf->pinned) + if (umem_dmabuf->pinned) { dma_buf_unpin(umem_dmabuf->attach); + umem_dmabuf->pinned = 0; + } + umem_dmabuf->revoked = 1; +end: dma_resv_unlock(dmabuf->resv); +} +EXPORT_SYMBOL(ib_umem_dmabuf_revoke); + +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; + + ib_umem_dmabuf_revoke(umem_dmabuf); dma_buf_detach(dmabuf, umem_dmabuf->attach); dma_buf_put(dmabuf); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index de05268ed632..7dc7b1cc71b5 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -38,6 +38,7 @@ struct ib_umem_dmabuf { unsigned long last_sg_trim; void *private; u8 pinned : 1; + u8 revoked : 1; }; static inline struct ib_umem_dmabuf 
*to_ib_umem_dmabuf(struct ib_umem *umem) @@ -158,6 +159,7 @@ ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf); +void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf); #else /* CONFIG_INFINIBAND_USER_MEM */ @@ -217,6 +219,7 @@ static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) } static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) { } static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) { } +static inline void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) {} #endif /* CONFIG_INFINIBAND_USER_MEM */ #endif /* IB_UMEM_H */ From 3aa73c6b795b9aaaf933f3c95495d85fc0de39e3 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:15 +0300 Subject: [PATCH 11/99] RDMA: Pass uverbs_attr_bundle as part of '.reg_user_mr_dmabuf' API Pass uverbs_attr_bundle as part of '.reg_user_mr_dmabuf' API instead of udata. This enables passing some new ioctl attributes to the drivers, as will be introduced in the next patches for mlx5 driver. Change the involved drivers accordingly. 
Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/9a25b2fc02443f7c36c2d93499ae25252b6afd40.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_std_types_mr.c | 2 +- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3 ++- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 2 +- drivers/infiniband/hw/efa/efa.h | 2 +- drivers/infiniband/hw/efa/efa_verbs.c | 4 ++-- drivers/infiniband/hw/irdma/verbs.c | 2 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/mr.c | 2 +- include/rdma/ib_verbs.h | 2 +- 9 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 03e1db5d1e8c..7ebc7bd3caae 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -239,7 +239,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd, access_flags, - &attrs->driver_udata); + attrs); if (IS_ERR(mr)) return PTR_ERR(mr); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 7c757351a016..43a68e7de02a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -4122,7 +4122,8 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, - int mr_access_flags, struct ib_udata *udata) + int mr_access_flags, + struct uverbs_attr_bundle *attrs) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); struct bnxt_re_dev *rdev = pd->rdev; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index e98cb1717338..3ddeda312376 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -242,7 +242,7 @@ 
struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, int mr_access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index e580e087e9da..d7fc9d5eeefd 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -168,7 +168,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable); diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index b1e0a1b7c59d..cc13415ff7e7 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1684,14 +1684,14 @@ static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct efa_dev *dev = to_edev(ibpd->device); struct ib_umem_dmabuf *umem_dmabuf; struct efa_mr *mr; int err; - mr = efa_alloc_mr(ibpd, access_flags, udata); + mr = efa_alloc_mr(ibpd, access_flags, &attrs->driver_udata); if (IS_ERR(mr)) { err = PTR_ERR(mr); goto err_out; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 
fc0ce35da14e..6a107decb704 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3085,7 +3085,7 @@ error: static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 len, u64 virt, int fd, int access, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct irdma_device *iwdev = to_iwdev(pd->device); struct ib_umem_dmabuf *umem_dmabuf; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b2ebea173547..e915a62da49c 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1354,7 +1354,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx5_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 98bd8eaa393e..1dfd9124bdd1 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1513,7 +1513,7 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6c5712ae559d..a1dcf812d787 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2476,7 +2476,7 @@ struct ib_device_ops { struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int mr_access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int 
flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, From de8f847a5114ff7cfcdfc114af8485c431dec703 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:16 +0300 Subject: [PATCH 12/99] RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct Add support for DMABUF MR registrations with Data-direct device. Upon userspace calling to register a DMABUF MR with the data direct bit set, the below algorithm will be followed. 1) Obtain a pinned DMABUF umem from the IB core using the user input parameters (FD, offset, length) and the DMA PF device. The DMA PF device is needed to allow the IOMMU to enable the DMA PF to access the user buffer over PCI. 2) Create a KSM MKEY by setting its entries according to the user buffer VA to IOVA mapping, with the MKEY being the data direct device-crossed MKEY. This KSM MKEY is umrable and will be used as part of the MR cache. The PD for creating it is the internal device 'data direct' kernel one. 3) Create a crossing MKEY that points to the KSM MKEY using the crossing access mode. 4) Manage the KSM MKEY by adding it to a list of 'data direct' MKEYs managed on the mlx5_ib device. 5) Return the crossing MKEY to the user, created with its supplied PD. Upon DMA PF unbind flow, the driver will revoke the KSM entries. The final deregistration will occur under the hood once the application deregisters its MKEY. Notes: - This version supports only the PINNED UMEM mode, so there is no dependency on ODP. - The IOVA supplied by the application must be system page aligned due to HW translations of KSM. - The crossing MKEY will not be umrable or part of the MR cache, as we cannot change its crossed (i.e. KSM) MKEY over UMR. 
Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/1f99d8020ed540d9702b9e2252a145a439609ba6.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 11 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 8 + drivers/infiniband/hw/mlx5/mr.c | 304 +++++++++++++++++++--- drivers/infiniband/hw/mlx5/odp.c | 5 +- drivers/infiniband/hw/mlx5/umr.c | 93 ++++--- drivers/infiniband/hw/mlx5/umr.h | 1 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 4 + include/uapi/rdma/mlx5_user_ioctl_verbs.h | 4 + 8 files changed, 358 insertions(+), 72 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index fc0562f07249..b85ad3c0bfa1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3490,6 +3490,7 @@ static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev) if (ret) return ret; + INIT_LIST_HEAD(&dev->data_direct_mr_list); ret = mlx5_data_direct_ib_reg(dev, vuid); if (ret) mlx5_ib_free_data_direct_resources(dev); @@ -3882,6 +3883,14 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE( dump_fill_mkey), UA_MANDATORY)); +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_reg_dmabuf_mr, + UVERBS_OBJECT_MR, + UVERBS_METHOD_REG_DMABUF_MR, + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + enum mlx5_ib_uapi_reg_dmabuf_flags, + UA_OPTIONAL)); + static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_devx_defs), UAPI_DEF_CHAIN(mlx5_ib_flow_defs), @@ -3891,6 +3900,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context), + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR), @@ -4396,6 +4406,7 @@ void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev 
*ibdev) { mutex_lock(&ibdev->data_direct_lock); + mlx5_ib_revoke_data_direct_mrs(ibdev); ibdev->data_direct_dev = NULL; mutex_unlock(&ibdev->data_direct_lock); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index e915a62da49c..be83a4d91a34 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -682,6 +682,8 @@ struct mlx5_ib_mr { struct mlx5_ib_mkey mmkey; struct ib_umem *umem; + /* The mr is data direct related */ + u8 data_direct :1; union { /* Used only by kernel MRs (umem == NULL) */ @@ -719,6 +721,10 @@ struct mlx5_ib_mr { } odp_destroy; struct ib_odp_counters odp_stats; bool is_odp_implicit; + /* The affiliated data direct crossed mr */ + struct mlx5_ib_mr *dd_crossed_mr; + struct list_head dd_node; + u8 revoked :1; }; }; }; @@ -1169,6 +1175,7 @@ struct mlx5_ib_dev { /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; struct list_head qp_list; + struct list_head data_direct_mr_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; @@ -1437,6 +1444,7 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, struct mlx5_data_direct_dev *dev); void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev); +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 1dfd9124bdd1..6829e3688b60 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -43,6 +43,7 @@ #include "dm.h" #include "mlx5_ib.h" #include "umr.h" +#include "data_direct.h" enum { MAX_PENDING_REG_MR = 8, @@ -54,7 +55,9 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context); static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct
ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate); + unsigned int page_size, bool populate, + int access_mode); +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr); static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, struct ib_pd *pd) @@ -1126,12 +1129,10 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, struct ib_umem *umem, u64 iova, - int access_flags) + int access_flags, int access_mode) { - struct mlx5r_cache_rb_key rb_key = { - .access_mode = MLX5_MKC_ACCESS_MODE_MTT, - }; struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5r_cache_rb_key rb_key = {}; struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; unsigned int page_size; @@ -1144,6 +1145,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); + rb_key.access_mode = access_mode; rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); @@ -1154,7 +1156,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, */ if (!ent) { mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, false); + mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode); mutex_unlock(&dev->slow_path_mutex); if (IS_ERR(mr)) return mr; @@ -1175,13 +1177,71 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, return mr; } +static struct ib_mr * +reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags, + u32 crossed_lkey) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING; + struct mlx5_ib_mr *mr; + void *mkc; + int inlen; + u32 *in; + int err; + + if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey)) + return ERR_PTR(-EOPNOTSUPP); + + mr = 
kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_1; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, crossing_target_vhca_id, + MLX5_CAP_GEN(dev->mdev, vhca_id)); + MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); + + /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */ + set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd); + MLX5_SET64(mkc, mkc, len, iova + length); + + MLX5_SET(mkc, mkc, free, 0); + MLX5_SET(mkc, mkc, umr_en, 0); + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); + if (err) + goto err_2; + + mr->mmkey.type = MLX5_MKEY_MR; + set_mr_fields(dev, mr, length, access_flags, iova); + mr->ibmr.pd = pd; + kvfree(in); + mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key); + + return &mr->ibmr; +err_2: + kvfree(in); +err_1: + kfree(mr); + return ERR_PTR(err); +} + /* * If ibmr is NULL it will be allocated by reg_create. * Else, the given ibmr will be used. 
*/ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate) + unsigned int page_size, bool populate, + int access_mode) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr; @@ -1190,7 +1250,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, int inlen; u32 *in; int err; - bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) && + (access_mode == MLX5_MKC_ACCESS_MODE_MTT); + bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); if (!page_size) return ERR_PTR(-EINVAL); @@ -1213,7 +1275,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, } pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); if (populate) { - if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { + if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) { err = -EINVAL; goto err_2; } @@ -1229,14 +1291,22 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); set_mkc_access_pd_addr_fields(mkc, access_flags, iova, populate ? 
pd : dev->umrc.pd); + /* In case of a data direct flow, overwrite the pdn field with its internal kernel PD */ + if (umem->is_dmabuf && ksm_mode) + MLX5_SET(mkc, mkc, pd, dev->ddr.pdn); + MLX5_SET(mkc, mkc, free, !populate); - MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET64(mkc, mkc, len, umem->length); MLX5_SET(mkc, mkc, bsf_octword_size, 0); - MLX5_SET(mkc, mkc, translations_octword_size, - get_octo_len(iova, umem->length, mr->page_shift)); + if (ksm_mode) + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift) * 2); + else + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift)); MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); if (mlx5_umem_needs_ats(dev, umem, access_flags)) MLX5_SET(mkc, mkc, ma_translation_mode, 1); @@ -1373,13 +1443,15 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); if (xlt_with_umr) { - mr = alloc_cacheable_mr(pd, umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); } else { unsigned int page_size = mlx5_umem_find_best_pgsz( umem, mkc, log_page_size, 0, iova); mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, true); + mr = reg_create(pd, umem, iova, access_flags, page_size, + true, MLX5_MKC_ACCESS_MODE_MTT); mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { @@ -1442,7 +1514,8 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ERR(odp)) return ERR_CAST(odp); - mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); if (IS_ERR(mr)) { ib_umem_release(&odp->umem); return ERR_CAST(mr); @@ -1510,35 +1583,31 @@ static struct
dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { .move_notify = mlx5_ib_dmabuf_invalidate_cb, }; -struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, - u64 length, u64 virt_addr, - int fd, int access_flags, - struct uverbs_attr_bundle *attrs) +static struct ib_mr * +reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, + u64 offset, u64 length, u64 virt_addr, + int fd, int access_flags, int access_mode) { + bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; struct ib_umem_dmabuf *umem_dmabuf; int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || - !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - return ERR_PTR(-EOPNOTSUPP); - - mlx5_ib_dbg(dev, - "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", - offset, virt_addr, length, fd, access_flags); - err = mlx5r_umr_resource_init(dev); if (err) return ERR_PTR(err); - /* dmabuf requires xlt update via umr to work. 
*/ - if (!mlx5r_umr_can_load_pas(dev, length)) - return ERR_PTR(-EINVAL); + if (!pinned_mode) + umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, + offset, length, fd, + access_flags, + &mlx5_ib_dmabuf_attach_ops); + else + umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev, + dma_device, offset, length, + fd, access_flags); - umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, - access_flags, - &mlx5_ib_dmabuf_attach_ops); if (IS_ERR(umem_dmabuf)) { mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", PTR_ERR(umem_dmabuf)); @@ -1546,7 +1615,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, } mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, - access_flags); + access_flags, access_mode); if (IS_ERR(mr)) { ib_umem_release(&umem_dmabuf->umem); return ERR_CAST(mr); @@ -1556,9 +1625,13 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); umem_dmabuf->private = mr; - err = mlx5r_store_odp_mkey(dev, &mr->mmkey); - if (err) - goto err_dereg_mr; + if (!pinned_mode) { + err = mlx5r_store_odp_mkey(dev, &mr->mmkey); + if (err) + goto err_dereg_mr; + } else { + mr->data_direct = true; + } err = mlx5_ib_init_dmabuf_mr(mr); if (err) @@ -1566,10 +1639,101 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, return &mr->ibmr; err_dereg_mr: - mlx5_ib_dereg_mr(&mr->ibmr, NULL); + __mlx5_ib_dereg_mr(&mr->ibmr); return ERR_PTR(err); } +static struct ib_mr * +reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_data_direct_dev *data_direct_dev; + struct ib_mr *crossing_mr; + struct ib_mr *crossed_mr; + int ret = 0; + + /* As of HW behaviour the IOVA must be page aligned in KSM mode */ + if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND)) + return 
ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -EINVAL; + goto end; + } + + /* The device's 'data direct mkey' was created without RO flags to + * simplify things and allow for a single mkey per device. + * Since RO is not a must, mask it out accordingly. + */ + access_flags &= ~IB_ACCESS_RELAXED_ORDERING; + crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev, + offset, length, virt_addr, fd, + access_flags, MLX5_MKC_ACCESS_MODE_KSM); + if (IS_ERR(crossed_mr)) { + ret = PTR_ERR(crossed_mr); + goto end; + } + + mutex_lock(&dev->slow_path_mutex); + crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags, + crossed_mr->lkey); + mutex_unlock(&dev->slow_path_mutex); + if (IS_ERR(crossing_mr)) { + __mlx5_ib_dereg_mr(crossed_mr); + ret = PTR_ERR(crossing_mr); + goto end; + } + + list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list); + to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr); + to_mmr(crossing_mr)->data_direct = true; +end: + mutex_unlock(&dev->data_direct_lock); + return ret ? 
ERR_PTR(ret) : crossing_mr; +} + +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int mlx5_access_flags = 0; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || + !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + return ERR_PTR(-EOPNOTSUPP); + + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) { + err = uverbs_get_flags32(&mlx5_access_flags, attrs, + MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT); + if (err) + return ERR_PTR(err); + } + + mlx5_ib_dbg(dev, + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n", + offset, virt_addr, length, fd, access_flags, mlx5_access_flags); + + /* dmabuf requires xlt update via umr to work. */ + if (!mlx5r_umr_can_load_pas(dev, length)) + return ERR_PTR(-EINVAL); + + if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT) + return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr, + fd, access_flags); + + return reg_user_mr_dmabuf(pd, pd->device->dma_device, + offset, length, virt_addr, + fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT); +} + /* * True if the change in access flags can be done via UMR, only some access * flags can be updated. 
@@ -1665,7 +1829,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, struct mlx5_ib_mr *mr = to_mmr(ib_mr); int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg( @@ -1793,7 +1957,7 @@ err: static void mlx5_free_priv_descs(struct mlx5_ib_mr *mr) { - if (!mr->umem && mr->descs) { + if (!mr->umem && !mr->data_direct && mr->descs) { struct ib_device *device = mr->ibmr.device; int size = mr->max_descs * mr->desc_size; struct mlx5_ib_dev *dev = to_mdev(device); @@ -1847,6 +2011,34 @@ end: return ret; } +static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); + int err; + + lockdep_assert_held(&dev->data_direct_lock); + mr->revoked = true; + err = mlx5r_umr_revoke_mr(mr); + if (WARN_ON(err)) + return err; + + ib_umem_dmabuf_revoke(umem_dmabuf); + return 0; +} + +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_mr *mr, *next; + + lockdep_assert_held(&dev->data_direct_lock); + + list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) { + list_del(&mr->dd_node); + mlx5_ib_revoke_data_direct_mr(mr); + } +} + static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); @@ -1864,7 +2056,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) return destroy_mkey(dev, mr); } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx5_ib_mr *mr = to_mmr(ibmr); struct mlx5_ib_dev *dev = to_mdev(ibmr->device); @@ -1931,6 +2123,36 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) return 0; } +static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr; + 
int ret; + + ret = __mlx5_ib_dereg_mr(&mr->ibmr); + if (ret) + return ret; + + mutex_lock(&dev->data_direct_lock); + if (!dd_crossed_mr->revoked) + list_del(&dd_crossed_mr->dd_node); + + ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr); + mutex_unlock(&dev->data_direct_lock); + return ret; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + + if (mr->data_direct) + return dereg_crossing_data_direct_mr(dev, mr); + + return __mlx5_ib_dereg_mr(ibmr); +} + static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, int access_mode, int page_shift) { diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index a524181f34df..44a3428ea342 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -710,7 +710,10 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, ib_umem_dmabuf_unmap_pages(umem_dmabuf); err = -EINVAL; } else { - err = mlx5r_umr_update_mr_pas(mr, xlt_flags); + if (mr->data_direct) + err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags); + else + err = mlx5r_umr_update_mr_pas(mr, xlt_flags); } dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index ffc31b01f690..eb74c163fd83 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -632,44 +632,47 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, wqe->data_seg.byte_count = cpu_to_be32(sg->length); } -/* - * Send the DMA list to the HW for a normal MR using UMR. - * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP - * flag may be used. - */ -int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +static int +_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd) { + size_t ent_size = dd ? 
sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt); struct mlx5_ib_dev *dev = mr_to_mdev(mr); struct device *ddev = &dev->mdev->pdev->dev; struct mlx5r_umr_wqe wqe = {}; struct ib_block_iter biter; + struct mlx5_ksm *cur_ksm; struct mlx5_mtt *cur_mtt; size_t orig_sg_length; - struct mlx5_mtt *mtt; size_t final_size; + void *curr_entry; struct ib_sge sg; + void *entry; u64 offset = 0; int err = 0; - if (WARN_ON(mr->umem->is_odp)) - return -EINVAL; - - mtt = mlx5r_umr_create_xlt( - dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), - sizeof(*mtt), flags); - if (!mtt) + entry = mlx5r_umr_create_xlt(dev, &sg, + ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), + ent_size, flags); + if (!entry) return -ENOMEM; orig_sg_length = sg.length; - mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg); mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, mr->page_shift); + if (dd) { + /* Use the data direct internal kernel PD */ + MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn); + cur_ksm = entry; + } else { + cur_mtt = entry; + } + mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); - cur_mtt = mtt; + curr_entry = entry; rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) { - if (cur_mtt == (void *)mtt + sg.length) { + if (curr_entry == entry + sg.length) { dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -681,23 +684,31 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) DMA_TO_DEVICE); offset += sg.length; mlx5r_umr_update_offset(&wqe.ctrl_seg, offset); - - cur_mtt = mtt; + if (dd) + cur_ksm = entry; + else + cur_mtt = entry; } - cur_mtt->ptag = - cpu_to_be64(rdma_block_iter_dma_address(&biter) | - MLX5_IB_MTT_PRESENT); - - if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) - cur_mtt->ptag = 0; - - cur_mtt++; + if (dd) { + cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter)); + cur_ksm->key = cpu_to_be32(dev->ddr.mkey); + cur_ksm++; + curr_entry = cur_ksm; + } else { 
+ cur_mtt->ptag = + cpu_to_be64(rdma_block_iter_dma_address(&biter) | + MLX5_IB_MTT_PRESENT); + if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) + cur_mtt->ptag = 0; + cur_mtt++; + curr_entry = cur_mtt; + } } - final_size = (void *)cur_mtt - (void *)mtt; + final_size = curr_entry - entry; sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT); - memset(cur_mtt, 0, sg.length - final_size); + memset(curr_entry, 0, sg.length - final_size); mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -705,10 +716,32 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) err: sg.length = orig_sg_length; - mlx5r_umr_unmap_free_xlt(dev, mtt, &sg); + mlx5r_umr_unmap_free_xlt(dev, entry, &sg); return err; } +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + /* No invalidation flow is expected */ + if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, true); +} + +/* + * Send the DMA list to the HW for a normal MR using UMR. + * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP + * flag may be used. 
+ */ +int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + if (WARN_ON(mr->umem->is_odp)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, false); +} + static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) { return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h index 5f734dc72bef..4a02c9b5aad8 100644 --- a/drivers/infiniband/hw/mlx5/umr.h +++ b/drivers/infiniband/hw/mlx5/umr.h @@ -95,6 +95,7 @@ int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr); int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, int access_flags); int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags); int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, int page_shift, int flags); diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 5b74d6534899..106276a4cce7 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -274,6 +274,10 @@ enum mlx5_ib_create_cq_attrs { MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX = UVERBS_ID_DRIVER_NS_WITH_UHW, }; +enum mlx5_ib_reg_dmabuf_mr_attrs { + MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS = (1U << UVERBS_ID_NS_SHIFT), +}; + #define MLX5_IB_DW_MATCH_PARAM 0xA0 struct mlx5_ib_match_params { diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 3189c7f08d17..7c233df475e7 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -54,6 +54,10 @@ enum mlx5_ib_uapi_flow_action_packet_reformat_type { MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3, }; +enum mlx5_ib_uapi_reg_dmabuf_flags { + MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT = 1 << 0, +}; + struct mlx5_ib_uapi_devx_async_cmd_hdr { __aligned_u64 wr_id; 
__u8 out_data[]; From ec7ad6530909983c8736c80af46e3529ce7bab55 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:17 +0300 Subject: [PATCH 13/99] RDMA/mlx5: Introduce GET_DATA_DIRECT_SYSFS_PATH ioctl Introduce the 'GET_DATA_DIRECT_SYSFS_PATH' ioctl to return the sysfs path of the affiliated 'data direct' device for a given device. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/403745463e0ef52adbef681ff09aa6a29a756352.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/std_types.c | 55 +++++++++++++++++++++++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 5 +++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index 5c83765a85e0..bdb568411091 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -10,6 +10,7 @@ #include #include #include "mlx5_ib.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include @@ -204,6 +205,50 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_QUERY_PORT)( sizeof(info)); } +static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_data_direct_dev *data_direct_dev; + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + int out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH); + u32 dev_path_len; + char *dev_path; + int ret; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -ENODEV; + goto end; + } + + dev_path = kobject_get_path(&data_direct_dev->device->kobj, GFP_KERNEL); + if (!dev_path) { + ret = -ENOMEM; + goto end; + } + + dev_path_len = strlen(dev_path) + 1; + if (dev_path_len > out_len) { + ret = -ENOSPC; + goto end; + } + + ret = 
uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path, + dev_path_len); + kfree(dev_path); + +end: + mutex_unlock(&dev->data_direct_lock); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_QUERY_PORT, UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_QUERY_PORT_PORT_NUM, @@ -214,9 +259,17 @@ DECLARE_UVERBS_NAMED_METHOD( reg_c0), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_MIN_SIZE(0), + UA_MANDATORY)); + ADD_UVERBS_METHODS(mlx5_ib_device, UVERBS_OBJECT_DEVICE, - &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT)); + &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT), + &UVERBS_METHOD(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)); DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_PD_QUERY, diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 106276a4cce7..fd2e4a3a56b3 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -348,6 +348,7 @@ enum mlx5_ib_pd_methods { enum mlx5_ib_device_methods { MLX5_IB_METHOD_QUERY_PORT = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH, }; enum mlx5_ib_query_port_attrs { @@ -355,4 +356,8 @@ enum mlx5_ib_query_port_attrs { MLX5_IB_ATTR_QUERY_PORT, }; +enum mlx5_ib_get_data_direct_sysfs_path_attrs { + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH = (1U << UVERBS_ID_NS_SHIFT), +}; + #endif From 53ffc09a3e6d39d7a9b3758be4a8795fb57a7989 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Fri, 16 Aug 2024 18:13:58 +0800 Subject: [PATCH 14/99] RDMA/mlx5: Remove two unused declarations Commit e6fb246ccafb ("RDMA/mlx5: Consolidate MR destruction to mlx5_ib_dereg_mr()") removed mlx5_ib_free_implicit_mr() but left the declaration. Commit d98995b4bf98 ("net/mlx5: Reimplement write combining test") left mlx5_ib_test_wc(). Remove the unused declarations. 
Link: https://patch.msgid.link/r/20240816101358.881247-1-yuehaibing@huawei.com Signed-off-by: Yue Haibing Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index be83a4d91a34..c0b1a9cd752b 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1372,7 +1372,6 @@ int mlx5_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata); int mlx5_ib_dealloc_mw(struct ib_mw *mw); struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags); -void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr); struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, @@ -1653,8 +1652,6 @@ static inline void mlx5r_deref_wait_odp_mkey(struct mlx5_ib_mkey *mmkey) wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0); } -int mlx5_ib_test_wc(struct mlx5_ib_dev *dev); - static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev) { /* From 1fb797af8a4bb0569c1bc8bf95b630ecaf7bd38e Mon Sep 17 00:00:00 2001 From: Zhang Zekun Date: Sun, 18 Aug 2024 13:57:01 +0800 Subject: [PATCH 15/99] RDMA/core: Remove unused declaration rdma_resolve_ip_route() The definition of rdma_resolve_ip_route() has been removed. Remove the unused declaration. 
Fixes: 6aaecd385685 ("RDMA/core: Simplify roce_resolve_route_from_path()") Link: https://patch.msgid.link/r/20240818055702.79547-2-zhangzekun11@huawei.com Signed-off-by: Zhang Zekun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index dd7715ba9fd1..05102769a918 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -325,9 +325,6 @@ void ib_qp_usecnt_inc(struct ib_qp *qp); void ib_qp_usecnt_dec(struct ib_qp *qp); struct rdma_dev_addr; -int rdma_resolve_ip_route(struct sockaddr *src_addr, - const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr); int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, From e2e641fe1c69bbbe94a89e814967da50e6df226b Mon Sep 17 00:00:00 2001 From: Zhang Zekun Date: Sun, 18 Aug 2024 13:57:02 +0800 Subject: [PATCH 16/99] RDMA/ipoib: Remove unused declarations There are some declarations without function definition, which are listed as below: 1. ipoib_ib_tx_timer_func() has also been removed since commit 8966e28d2e40 ("IB/ipoib: Use NAPI in UD/TX flows") 2. ipoib_pkey_event() has been removed since commit ee1e2c82c245 ("IPoIB: Refresh paths instead of flushing them on SM change events") 3. ipoib_mcast_dev_down() has been removed since commit 988bd50300ef ("IPoIB: Fix memory leak of multicast group structures") 4. ipoib_pkey_open() has been removed since commit dd57c9308aff ("IB/ipoib: Avoid multicast join attempts with invalid P_key") Remove these unused declarations. 
Link: https://patch.msgid.link/r/20240818055702.79547-3-zhangzekun11@huawei.com Signed-off-by: Zhang Zekun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 963e936da5e3..abe0522b7df4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -509,12 +509,10 @@ struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port, const char *format); int ipoib_intf_init(struct ib_device *hca, u32 port, const char *format, struct net_device *dev); -void ipoib_ib_tx_timer_func(struct timer_list *t); void ipoib_ib_dev_flush_light(struct work_struct *work); void ipoib_ib_dev_flush_normal(struct work_struct *work); void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_ib_tx_timeout_work(struct work_struct *work); -void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); int ipoib_ib_dev_open_default(struct net_device *dev); @@ -533,7 +531,6 @@ void ipoib_mcast_restart_task(struct work_struct *work); void ipoib_mcast_start_thread(struct net_device *dev); void ipoib_mcast_stop_thread(struct net_device *dev); -void ipoib_mcast_dev_down(struct net_device *dev); void ipoib_mcast_dev_flush(struct net_device *dev); int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req); @@ -610,7 +607,6 @@ int ipoib_set_mode(struct net_device *dev, const char *buf); void ipoib_setup_common(struct net_device *dev); -void ipoib_pkey_open(struct ipoib_dev_priv *priv); void ipoib_drain_cq(struct net_device *dev); void ipoib_set_ethtool_ops(struct net_device *dev); From 92c7ad8364b2cad7e3c4842f3a77dda3b6e1fb88 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 23 Aug 2024 18:18:37 +0800 Subject: [PATCH 17/99] RDMA/qib: Simplify an alloc_ordered_workqueue() invocation Let alloc_ordered_workqueue() format the workqueue name instead of 
calling snprintf() explicitly. Link: https://patch.msgid.link/r/20240823101840.515398-2-ruanjinjie@huawei.com Signed-off-by: Jinjie Ruan Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_init.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index db3b25c8433a..4100656fe9a3 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -581,12 +581,9 @@ static int qib_create_workqueues(struct qib_devdata *dd) for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; if (!ppd->qib_wq) { - char wq_name[23]; - - snprintf(wq_name, sizeof(wq_name), "qib%d_%d", - dd->unit, pidx); - ppd->qib_wq = alloc_ordered_workqueue(wq_name, - WQ_MEM_RECLAIM); + ppd->qib_wq = alloc_ordered_workqueue("qib%d_%d", + WQ_MEM_RECLAIM, + dd->unit, pidx); if (!ppd->qib_wq) goto wq_error; } From 7229d7b64e2e99fc2c7e9bdc8d5c687fad2e9bbb Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 23 Aug 2024 18:18:38 +0800 Subject: [PATCH 18/99] RDMA/mad: Simplify an alloc_ordered_workqueue() invocation Let alloc_ordered_workqueue() format the workqueue name instead of calling snprintf() explicitly. 
Link: https://patch.msgid.link/r/20240823101840.515398-3-ruanjinjie@huawei.com Signed-off-by: Jinjie Ruan Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/mad.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 70708fea1296..1fd54d5c4dd8 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2939,7 +2939,6 @@ static int ib_mad_port_open(struct ib_device *device, int ret, cq_size; struct ib_mad_port_private *port_priv; unsigned long flags; - char name[sizeof "ib_mad123"]; int has_smi; if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) @@ -2992,8 +2991,8 @@ static int ib_mad_port_open(struct ib_device *device, goto error7; } - snprintf(name, sizeof(name), "ib_mad%u", port_num); - port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + port_priv->wq = alloc_ordered_workqueue("ib_mad%u", WQ_MEM_RECLAIM, + port_num); if (!port_priv->wq) { ret = -ENOMEM; goto error8; From 87a55daa67795c5c01fa2c31bba6e784cf0c121b Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 23 Aug 2024 18:18:39 +0800 Subject: [PATCH 19/99] RDMA/mlx4: Simplify an alloc_ordered_workqueue() invocation Let alloc_ordered_workqueue() format the workqueue name instead of calling snprintf() explicitly. 
Link: https://patch.msgid.link/r/20240823101840.515398-4-ruanjinjie@huawei.com Signed-off-by: Jinjie Ruan Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/alias_GUID.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 9a439569ffcf..d7327735b8d0 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -829,7 +829,6 @@ void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) { - char alias_wq_name[22]; int ret = 0; int i, j; union ib_gid gid; @@ -875,9 +874,8 @@ int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; dev->sriov.alias_guid.ports_guid[i].port = i; - snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); dev->sriov.alias_guid.ports_guid[i].wq = - alloc_ordered_workqueue(alias_wq_name, WQ_MEM_RECLAIM); + alloc_ordered_workqueue("alias_guid%d", WQ_MEM_RECLAIM, i); if (!dev->sriov.alias_guid.ports_guid[i].wq) { ret = -ENOMEM; goto err_thread; From ae46d3fc17f960dc2d1399acdc488e27638a71a7 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 23 Aug 2024 18:18:40 +0800 Subject: [PATCH 20/99] RDMA/mlx4: Simplify an alloc_ordered_workqueue() invocation Let alloc_ordered_workqueue() format the workqueue name instead of calling snprintf() explicitly. 
Link: https://patch.msgid.link/r/20240823101840.515398-5-ruanjinjie@huawei.com Signed-off-by: Jinjie Ruan Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/mad.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index dc9cf45d2d32..e6e132f10625 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -2158,7 +2158,6 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, struct mlx4_ib_demux_ctx *ctx, int port) { - char name[21]; int ret = 0; int i; @@ -2194,24 +2193,21 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, goto err_mcg; } - snprintf(name, sizeof(name), "mlx4_ibt%d", port); - ctx->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->wq = alloc_ordered_workqueue("mlx4_ibt%d", WQ_MEM_RECLAIM, port); if (!ctx->wq) { pr_err("Failed to create tunnelling WQ for port %d\n", port); ret = -ENOMEM; goto err_wq; } - snprintf(name, sizeof(name), "mlx4_ibwi%d", port); - ctx->wi_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->wi_wq = alloc_ordered_workqueue("mlx4_ibwi%d", WQ_MEM_RECLAIM, port); if (!ctx->wi_wq) { pr_err("Failed to create wire WQ for port %d\n", port); ret = -ENOMEM; goto err_wiwq; } - snprintf(name, sizeof(name), "mlx4_ibud%d", port); - ctx->ud_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->ud_wq = alloc_ordered_workqueue("mlx4_ibud%d", WQ_MEM_RECLAIM, port); if (!ctx->ud_wq) { pr_err("Failed to create up/down WQ for port %d\n", port); ret = -ENOMEM; From c87c5f47ff72a14001c982d3bc58d7b98ca746c3 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Thu, 22 Aug 2024 14:52:21 +0800 Subject: [PATCH 21/99] RDMA/rxe: Use sizeof instead of hard code number Use 'sizeof(union rdma_network_hdr)' instead of hard code GRH length for GSI and UD. 
Link: https://patch.msgid.link/r/20240822065223.1117056-2-pizhenwei@bytedance.com Signed-off-by: zhenwei pi Reviewed-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 6596a85723c9..bf8f4bc8c5c8 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -351,7 +351,7 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, for (i = 0; i < qp->resp.wqe->dma.num_sge; i++) recv_buffer_len += qp->resp.wqe->dma.sge[i].length; - if (payload + 40 > recv_buffer_len) { + if (payload + sizeof(union rdma_network_hdr) > recv_buffer_len) { rxe_dbg_qp(qp, "The receive buffer is too small for this UD packet.\n"); return RESPST_ERR_LENGTH; } From 938aa9a3334881cd817fdb096426d291a73c4d3d Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Thu, 22 Aug 2024 14:52:22 +0800 Subject: [PATCH 22/99] RDMA/rxe: Fix misspelling of 'rmda' Fix 'rmda' into 'RDMA'. Link: https://patch.msgid.link/r/20240822065223.1117056-3-pizhenwei@bytedance.com Signed-off-by: zhenwei pi Reviewed-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index bf8f4bc8c5c8..c11ab280551a 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -341,7 +341,7 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, /* * See IBA C9-92 * For UD QPs we only check if the packet will fit in the - * receive buffer later. For rmda operations additional + * receive buffer later. For RDMA operations additional * length checks are performed in check_rkey. 
*/ if ((qp_type(qp) == IB_QPT_GSI) || (qp_type(qp) == IB_QPT_UD)) { From 444948ee12c298b716635e8c4cd66fa5cd541592 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Thu, 22 Aug 2024 14:52:23 +0800 Subject: [PATCH 23/99] RDMA/rxe: Fix __bth_set_resv6a __bth_set_resv6a is used to clear BIT [24, 29] of rxe_bth::qpn, the wrong expression leads other BITs into 1. Link: https://patch.msgid.link/r/20240822065223.1117056-4-pizhenwei@bytedance.com Signed-off-by: zhenwei pi Reviewed-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_hdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h index 46f82b27fcd2..1f0322491d8c 100644 --- a/drivers/infiniband/sw/rxe/rxe_hdr.h +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -234,7 +234,7 @@ static inline void __bth_set_resv6a(void *arg) { struct rxe_bth *bth = arg; - bth->qpn = cpu_to_be32(~BTH_RESV6A_MASK); + bth->qpn &= cpu_to_be32(~BTH_RESV6A_MASK); } static inline int __bth_ack(void *arg) From 86dfdd8288907f03c18b7fb462e0e232c4f98d89 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 20 Aug 2024 13:33:36 +0200 Subject: [PATCH 24/99] RDMA/iwcm: Fix WARNING:at_kernel/workqueue.c:#check_flush_dependency In the commit aee2424246f9 ("RDMA/iwcm: Fix a use-after-free related to destroying CM IDs"), the function flush_workqueue is invoked to flush the work queue iwcm_wq. But at that time, the work queue iwcm_wq was created via the function alloc_ordered_workqueue without the flag WQ_MEM_RECLAIM. Because the current process is trying to flush the whole iwcm_wq, if iwcm_wq doesn't have the flag WQ_MEM_RECLAIM, verify that the current process is not reclaiming memory or running on a workqueue which doesn't have the flag WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to a deadlock. The call trace is as below: [ 125.350876][ T1430] Call Trace: [ 125.356281][ T1430] [ 125.361285][ T1430] ? 
__warn (kernel/panic.c:693) [ 125.367640][ T1430] ? check_flush_dependency (kernel/workqueue.c:3706 (discriminator 9)) [ 125.375689][ T1430] ? report_bug (lib/bug.c:180 lib/bug.c:219) [ 125.382505][ T1430] ? handle_bug (arch/x86/kernel/traps.c:239) [ 125.388987][ T1430] ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1)) [ 125.395831][ T1430] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:621) [ 125.403125][ T1430] ? check_flush_dependency (kernel/workqueue.c:3706 (discriminator 9)) [ 125.410984][ T1430] ? check_flush_dependency (kernel/workqueue.c:3706 (discriminator 9)) [ 125.418764][ T1430] __flush_workqueue (kernel/workqueue.c:3970) [ 125.426021][ T1430] ? __pfx___might_resched (kernel/sched/core.c:10151) [ 125.433431][ T1430] ? destroy_cm_id (drivers/infiniband/core/iwcm.c:375) iw_cm [ 125.441209][ T1430] ? __pfx___flush_workqueue (kernel/workqueue.c:3910) [ 125.473900][ T1430] ? _raw_spin_lock_irqsave (arch/x86/include/asm/atomic.h:107 include/linux/atomic/atomic-arch-fallback.h:2170 include/linux/atomic/atomic-instrumented.h:1302 include/asm-generic/qspinlock.h:111 include/linux/spinlock.h:187 include/linux/spinlock_api_smp.h:111 kernel/locking/spinlock.c:162) [ 125.473909][ T1430] ? __pfx__raw_spin_lock_irqsave (kernel/locking/spinlock.c:161) [ 125.482537][ T1430] _destroy_id (drivers/infiniband/core/cma.c:2044) rdma_cm [ 125.495072][ T1430] nvme_rdma_free_queue (drivers/nvme/host/rdma.c:656 drivers/nvme/host/rdma.c:650) nvme_rdma [ 125.505827][ T1430] nvme_rdma_reset_ctrl_work (drivers/nvme/host/rdma.c:2180) nvme_rdma [ 125.505831][ T1430] process_one_work (kernel/workqueue.c:3231) [ 125.515122][ T1430] worker_thread (kernel/workqueue.c:3306 kernel/workqueue.c:3393) [ 125.515127][ T1430] ? __pfx_worker_thread (kernel/workqueue.c:3339) [ 125.531837][ T1430] kthread (kernel/kthread.c:389) [ 125.539864][ T1430] ? __pfx_kthread (kernel/kthread.c:342) [ 125.550628][ T1430] ret_from_fork (arch/x86/kernel/process.c:147) [ 125.558840][ T1430] ? 
__pfx_kthread (kernel/kthread.c:342) [ 125.558844][ T1430] ret_from_fork_asm (arch/x86/entry/entry_64.S:257) [ 125.566487][ T1430] [ 125.566488][ T1430] ---[ end trace 0000000000000000 ]--- Fixes: aee2424246f9 ("RDMA/iwcm: Fix a use-after-free related to destroying CM IDs") Link: https://patch.msgid.link/r/20240820113336.19860-1-yanjun.zhu@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202408151633.fc01893c-oliver.sang@intel.com Tested-by: kernel test robot Signed-off-by: Zhu Yanjun Reviewed-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/iwcm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 1a6339f3a63f..7e3a55349e10 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -1182,7 +1182,7 @@ static int __init iw_cm_init(void) if (ret) return ret; - iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0); + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); if (!iwcm_wq) goto err_alloc; From 04e36fd27a2aebb03ef019debc4df247f2a427c6 Mon Sep 17 00:00:00 2001 From: Yehuda Yitschak Date: Thu, 22 Aug 2024 17:11:43 +0000 Subject: [PATCH 25/99] RDMA/efa: Add support for node guid Propagate the unique, per device, ID in the device attributes to the standard node_guid value in IB device. 
Link: https://patch.msgid.link/r/20240822171143.2800-1-mrgolin@amazon.com Reviewed-by: Yonatan Nachum Signed-off-by: Yehuda Yitschak Signed-off-by: Michael Margolin Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_admin_cmds_defs.h | 3 +++ drivers/infiniband/hw/efa/efa_com_cmd.c | 1 + drivers/infiniband/hw/efa/efa_com_cmd.h | 1 + drivers/infiniband/hw/efa/efa_main.c | 1 + 4 files changed, 6 insertions(+) diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index 4296662e59c3..cd03a5429beb 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -674,6 +674,9 @@ struct efa_admin_feature_device_attr_desc { /* Max RDMA transfer size in bytes */ u32 max_rdma_size; + + /* Unique global ID for an EFA device */ + u64 guid; }; struct efa_admin_feature_queue_attr_desc { diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index 5b9c2b16df0e..5a774925cdea 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -465,6 +465,7 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, result->db_bar = resp.u.device_attr.db_bar; result->max_rdma_size = resp.u.device_attr.max_rdma_size; result->device_caps = resp.u.device_attr.device_caps; + result->guid = resp.u.device_attr.guid; if (result->admin_api_version < 1) { ibdev_err_ratelimited( diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 9714105fcf7e..668d033f7477 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -112,6 +112,7 @@ struct efa_com_get_device_attr_result { u8 addr[EFA_GID_SIZE]; u64 page_size_cap; u64 max_mr_pages; + u64 guid; u32 mtu; u32 fw_version; u32 admin_api_version; diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 1a777791bea3..ad225823e6f2 
100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -441,6 +441,7 @@ static int efa_ib_device_add(struct efa_dev *dev) efa_set_host_info(dev); dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; + dev->ibdev.node_guid = dev->dev_attr.guid; dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = dev->neqs ?: 1; dev->ibdev.dev.parent = &pdev->dev; From de1d364c3815f9360a0945097ca2731950e914fa Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Sun, 18 Aug 2024 21:47:23 -0700 Subject: [PATCH 26/99] RDMA/bnxt_re: Add support for Variable WQE in Genp7 adapters Variable size WQE means that each send Work Queue Entry to HW can use different WQE sizes as opposed to the static WQE size on the current devices. Set variable WQE mode for Gen P7 devices. Depth of the Queue will be a multiple of slot which is 16 bytes. The number of slots should be a multiple of 256 as per the HW requirement. Initialize the Software shadow queue to hold requests equal to the number of slots. Also, do not expose the variable size WQE capability until the last patch in the series. 
Link: https://patch.msgid.link/r/1724042847-1481-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Hongguang Gao Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 8 +++++--- drivers/infiniband/hw/bnxt_re/main.c | 21 +++++++++++---------- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 18 +++++++++--------- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 14 +++++++++++--- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 7 +++++-- drivers/infiniband/hw/bnxt_re/qplib_sp.h | 6 ++++++ 6 files changed, 47 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 7c757351a016..5073ab18dc19 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1156,6 +1156,7 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp /* Shadow QP SQ depth should be same as QP1 RQ depth */ qp->qplib_qp.sq.wqe_size = bnxt_re_get_wqe_size(0, 6); qp->qplib_qp.sq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.sq.max_sw_wqe = qp1_qp->rq.max_wqe; qp->qplib_qp.sq.max_sge = 2; /* Q full delta can be 1 since it is internal QP */ qp->qplib_qp.sq.q_full_delta = 1; @@ -1167,6 +1168,7 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp qp->qplib_qp.rq.wqe_size = bnxt_re_get_rwqe_size(6); qp->qplib_qp.rq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.rq.max_sw_wqe = qp1_qp->rq.max_wqe; qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge; /* Q full delta can be 1 since it is internal QP */ qp->qplib_qp.rq.q_full_delta = 1; @@ -1228,6 +1230,7 @@ static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp, */ entries = bnxt_re_init_depth(init_attr->cap.max_recv_wr + 1, uctx); rq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1); + rq->max_sw_wqe = rq->max_wqe; rq->q_full_delta = 0; rq->sg_info.pgsize = PAGE_SIZE; rq->sg_info.pgshft = PAGE_SHIFT; @@ -1287,6 +1290,7 @@ static int bnxt_re_init_sq_attr(struct bnxt_re_qp *qp, 0 : 
BNXT_QPLIB_RESERVED_QP_WRS; entries = bnxt_re_init_depth(entries + diff + 1, uctx); sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1); + sq->max_sw_wqe = bnxt_qplib_get_depth(sq, qplqp->wqe_mode, true); sq->q_full_delta = diff + 1; /* * Reserving one slot for Phantom WQE. Application can @@ -2155,6 +2159,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, entries = bnxt_re_init_depth(qp_attr->cap.max_recv_wr, uctx); qp->qplib_qp.rq.max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1); + qp->qplib_qp.rq.max_sw_wqe = qp->qplib_qp.rq.max_wqe; qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe - qp_attr->cap.max_recv_wr; qp->qplib_qp.rq.max_sge = qp_attr->cap.max_recv_sge; @@ -4187,9 +4192,6 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) resp.cqe_sz = sizeof(struct cq_base); resp.max_cqd = dev_attr->max_cq_wqes; - resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; - resp.mode = rdev->chip_ctx->modes.wqe_mode; - if (rdev->chip_ctx->modes.db_push) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_WC_DPI_ENABLED; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 9714b9ab7524..31ba89cffe9d 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -129,13 +129,13 @@ static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) } } -static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode) +static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *cctx; cctx = rdev->chip_ctx; - cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx) ? - mode : BNXT_QPLIB_WQE_MODE_STATIC; + cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? 
+ BNXT_QPLIB_WQE_MODE_VARIABLE : BNXT_QPLIB_WQE_MODE_STATIC; if (bnxt_re_hwrm_qcaps(rdev)) dev_err(rdev_to_dev(rdev), "Failed to query hwrm qcaps\n"); @@ -158,7 +158,7 @@ static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) kfree(chip_ctx); } -static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) +static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; @@ -180,7 +180,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) rdev->qplib_res.dattr = &rdev->dev_attr; rdev->qplib_res.is_vf = BNXT_EN_VF(en_dev); - bnxt_re_set_drv_mode(rdev, wqe_mode); + bnxt_re_set_drv_mode(rdev); bnxt_re_set_db_offset(rdev); rc = bnxt_qplib_map_db_bar(&rdev->qplib_res); @@ -1620,7 +1620,7 @@ static void bnxt_re_worker(struct work_struct *work) schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); } -static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) +static int bnxt_re_dev_init(struct bnxt_re_dev *rdev) { struct bnxt_re_ring_attr rattr = {}; struct bnxt_qplib_creq_ctx *creq; @@ -1638,7 +1638,7 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) } set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); - rc = bnxt_re_setup_chip_ctx(rdev, wqe_mode); + rc = bnxt_re_setup_chip_ctx(rdev); if (rc) { bnxt_unregister_dev(rdev->en_dev); clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); @@ -1790,7 +1790,7 @@ fail: return rc; } -static int bnxt_re_add_device(struct auxiliary_device *adev, u8 wqe_mode) +static int bnxt_re_add_device(struct auxiliary_device *adev) { struct bnxt_aux_priv *aux_priv = container_of(adev, struct bnxt_aux_priv, aux_dev); @@ -1807,7 +1807,7 @@ static int bnxt_re_add_device(struct auxiliary_device *adev, u8 wqe_mode) goto exit; } - rc = bnxt_re_dev_init(rdev, wqe_mode); + rc = bnxt_re_dev_init(rdev); if (rc) goto re_dev_dealloc; @@ -1937,7 +1937,8 @@ static int bnxt_re_probe(struct auxiliary_device *adev, 
int rc; mutex_lock(&bnxt_re_mutex); - rc = bnxt_re_add_device(adev, BNXT_QPLIB_WQE_MODE_STATIC); + + rc = bnxt_re_add_device(adev); if (rc) { mutex_unlock(&bnxt_re_mutex); return rc; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 49e4a4a50bfa..0af09e77e94b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -809,13 +809,13 @@ static int bnxt_qplib_alloc_init_swq(struct bnxt_qplib_q *que) { int indx; - que->swq = kcalloc(que->max_wqe, sizeof(*que->swq), GFP_KERNEL); + que->swq = kcalloc(que->max_sw_wqe, sizeof(*que->swq), GFP_KERNEL); if (!que->swq) return -ENOMEM; que->swq_start = 0; - que->swq_last = que->max_wqe - 1; - for (indx = 0; indx < que->max_wqe; indx++) + que->swq_last = que->max_sw_wqe - 1; + for (indx = 0; indx < que->max_sw_wqe; indx++) que->swq[indx].next_idx = indx + 1; que->swq[que->swq_last].next_idx = 0; /* Make it circular */ que->swq_last = 0; @@ -851,7 +851,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &sq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(sq); + hwq_attr.depth = bnxt_qplib_get_depth(sq, qp->wqe_mode, false); hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr); if (rc) @@ -879,7 +879,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &rq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(rq); + hwq_attr.depth = bnxt_qplib_get_depth(rq, qp->wqe_mode, false); hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr); if (rc) @@ -1011,7 +1011,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &sq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = 
bnxt_qplib_get_depth(sq); + hwq_attr.depth = bnxt_qplib_get_depth(sq, qp->wqe_mode, true); hwq_attr.aux_stride = psn_sz; hwq_attr.aux_depth = psn_sz ? bnxt_qplib_set_sq_size(sq, qp->wqe_mode) : 0; @@ -1052,7 +1052,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &rq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(rq); + hwq_attr.depth = bnxt_qplib_get_depth(rq, qp->wqe_mode, false); hwq_attr.aux_stride = 0; hwq_attr.aux_depth = 0; hwq_attr.type = HWQ_TYPE_QUEUE; @@ -2492,7 +2492,7 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, } sq = &qp->sq; - cqe_sq_cons = le16_to_cpu(hwcqe->sq_cons_idx) % sq->max_wqe; + cqe_sq_cons = le16_to_cpu(hwcqe->sq_cons_idx) % sq->max_sw_wqe; if (qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, "%s: QP in Flush QP = %p\n", __func__, qp); @@ -2882,7 +2882,7 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, cqe_cons = le16_to_cpu(hwcqe->sq_cons_idx); if (cqe_cons == 0xFFFF) goto do_rq; - cqe_cons %= sq->max_wqe; + cqe_cons %= sq->max_sw_wqe; if (qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 56538b90d6c5..f54d7a0c7dad 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -251,6 +251,7 @@ struct bnxt_qplib_q { struct bnxt_qplib_db_info dbinfo; struct bnxt_qplib_sg_info sg_info; u32 max_wqe; + u32 max_sw_wqe; u16 wqe_size; u16 q_full_delta; u16 max_sge; @@ -586,15 +587,22 @@ static inline void bnxt_qplib_swq_mod_start(struct bnxt_qplib_q *que, u32 idx) que->swq_start = que->swq[idx].next_idx; } -static inline u32 bnxt_qplib_get_depth(struct bnxt_qplib_q *que) +static inline u32 bnxt_qplib_get_depth(struct bnxt_qplib_q *que, u8 wqe_mode, bool is_sq) { - return (que->wqe_size * que->max_wqe) / sizeof(struct sq_sge); + u32 slots; + + /* Queue 
depth is the number of slots. */ + slots = (que->wqe_size * que->max_wqe) / sizeof(struct sq_sge); + /* For variable WQE mode, need to align the slots to 256 */ + if (wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE && is_sq) + slots = ALIGN(slots, BNXT_VAR_MAX_SLOT_ALIGN); + return slots; } static inline u32 bnxt_qplib_set_sq_size(struct bnxt_qplib_q *que, u8 wqe_mode) { return (wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? - que->max_wqe : bnxt_qplib_get_depth(que); + que->max_wqe : bnxt_qplib_get_depth(que, wqe_mode, true); } static inline u32 bnxt_qplib_set_sq_max_slot(u8 wqe_mode) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 9328db92fa6d..ca2aa35e6eec 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -95,11 +95,13 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_cmdqmsg msg = {}; struct creq_query_func_resp_sb *sb; struct bnxt_qplib_rcfw_sbuf sbuf; + struct bnxt_qplib_chip_ctx *cctx; struct cmdq_query_func req = {}; u8 *tqm_alloc; int i, rc; u32 temp; + cctx = rcfw->res->cctx; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_QUERY_FUNC, sizeof(req)); @@ -133,8 +135,9 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, * reporting the max number */ attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; - attr->max_qp_sges = bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx) ? - 6 : sb->max_sge; + + attr->max_qp_sges = cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE ? 
+ min_t(u32, sb->max_sge_var_wqe, BNXT_VAR_MAX_SGE) : 6; attr->max_cq = le32_to_cpu(sb->max_cq); attr->max_cq_wqes = le32_to_cpu(sb->max_cqe); attr->max_cq_sges = attr->max_qp_sges; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 16a67d70a6fc..a633e2a9aa94 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -40,6 +40,7 @@ #ifndef __BNXT_QPLIB_SP_H__ #define __BNXT_QPLIB_SP_H__ +#include <rdma/bnxt_re-abi.h> #define BNXT_QPLIB_RESERVED_QP_WRS 128 struct bnxt_qplib_dev_attr { @@ -351,4 +352,9 @@ int bnxt_qplib_qext_stat(struct bnxt_qplib_rcfw *rcfw, u32 fid, int bnxt_qplib_modify_cc(struct bnxt_qplib_res *res, struct bnxt_qplib_cc_param *cc_param); +#define BNXT_VAR_MAX_WQE 4352 +#define BNXT_VAR_MAX_SLOT_ALIGN 256 +#define BNXT_VAR_MAX_SGE 13 +#define BNXT_RE_MAX_RQ_WQES 65536 + #endif /* __BNXT_QPLIB_SP_H__*/ From 51edebb73497f5dc1da357947d6ac985f0158a1b Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Sun, 18 Aug 2024 21:47:24 -0700 Subject: [PATCH 27/99] RDMA/bnxt_re: Get the WQE index from slot index while completing the WQEs While reporting completions, the SQ Work Queue index is required to identify the WQE that generated the completion. In variable WQE mode, FW returns the slot index for error completions. The driver needs to walk through the shadow queue between the consumer index and producer index and match the slot index returned by FW. If a match is found, the next index of the shadow queue is the WQE index to be considered for the remainder of the poll_cq loop.
Link: https://patch.msgid.link/r/1724042847-1481-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Hongguang Gao Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 43 +++++++++++++++++++++++- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 10 ++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 0af09e77e94b..2810ffe3394b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2471,6 +2471,32 @@ out: return rc; } +static int bnxt_qplib_get_cqe_sq_cons(struct bnxt_qplib_q *sq, u32 cqe_slot) +{ + struct bnxt_qplib_hwq *sq_hwq; + struct bnxt_qplib_swq *swq; + int cqe_sq_cons = -1; + u32 start, last; + + sq_hwq = &sq->hwq; + + start = sq->swq_start; + last = sq->swq_last; + + while (last != start) { + swq = &sq->swq[last]; + if (swq->slot_idx == cqe_slot) { + cqe_sq_cons = swq->next_idx; + dev_err(&sq_hwq->pdev->dev, "%s: Found cons wqe = %d slot = %d\n", + __func__, cqe_sq_cons, cqe_slot); + break; + } + + last = swq->next_idx; + } + return cqe_sq_cons; +} + static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, struct cq_req *hwcqe, struct bnxt_qplib_cqe **pcqe, int *budget, @@ -2478,9 +2504,10 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, { struct bnxt_qplib_swq *swq; struct bnxt_qplib_cqe *cqe; + u32 cqe_sq_cons, slot_num; struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *sq; - u32 cqe_sq_cons; + int cqe_cons; int rc = 0; qp = (struct bnxt_qplib_qp *)((unsigned long) @@ -2498,6 +2525,20 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } + + if (__is_err_cqe_for_var_wqe(qp, hwcqe->status)) { + slot_num = le16_to_cpu(hwcqe->sq_cons_idx); + cqe_cons = bnxt_qplib_get_cqe_sq_cons(sq, slot_num); + if (cqe_cons < 0) { + dev_err(&cq->hwq.pdev->dev, "%s: Wrong SQ cons 
cqe_slot_indx = %d\n", + __func__, slot_num); + goto done; + } + cqe_sq_cons = cqe_cons; + dev_err(&cq->hwq.pdev->dev, "%s: cqe_sq_cons = %d swq_last = %d swq_start = %d\n", + __func__, cqe_sq_cons, sq->swq_last, sq->swq_start); + } + /* Require to walk the sq's swq to fabricate CQEs for all previously * signaled SWQEs due to CQE aggregation from the current sq cons * to the cqe_sq_cons diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index f54d7a0c7dad..2e7a4fd651b8 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -649,4 +649,14 @@ static inline __le64 bnxt_re_update_msn_tbl(u32 st_idx, u32 npsn, u32 start_psn) (((start_psn) << SQ_MSN_SEARCH_START_PSN_SFT) & SQ_MSN_SEARCH_START_PSN_MASK)); } + +static inline bool __is_var_wqe(struct bnxt_qplib_qp *qp) +{ + return (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE); +} + +static inline bool __is_err_cqe_for_var_wqe(struct bnxt_qplib_qp *qp, u8 status) +{ + return (status != CQ_REQ_STATUS_OK) && __is_var_wqe(qp); +} #endif /* __BNXT_QPLIB_FP_H__ */ From b930d0bac9c671c053dd66229010ca9298e84aab Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Sun, 18 Aug 2024 21:47:25 -0700 Subject: [PATCH 28/99] RDMA/bnxt_re: Fix the table size for PSN/MSN entries The HW MSN table size is always a power of 2, so the pages should be mapped accordingly. Round the number of PSN/MSN entries up to a power of two when calculating it.
Fixes: 6f6bfbc595fb ("RDMA/bnxt_re: Expose the MSN table capability for user library") Link: https://patch.msgid.link/r/1724042847-1481-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 5073ab18dc19..4dd137b7a5ce 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1042,6 +1042,8 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, qplib_qp->sq.max_wqe : ((qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size) / sizeof(struct bnxt_qplib_sge)); + if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2)) + psn_nume = roundup_pow_of_two(psn_nume); bytes += (psn_nume * psn_sz); } From d8ea645d6984c84a87032063a0941f15a323831f Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Sun, 18 Aug 2024 21:47:26 -0700 Subject: [PATCH 29/99] RDMA/bnxt_re: Handle variable WQE support for user applications User library calculates the number of slots required for user applications and it can pass that information to the driver. Driver can use this value and update the HW directly. This mechanism is currently used only for the newly introduced variable size WQEs. Extend the bnxt_re_qp_req structure to pass the Send Queue slot count. Reorganize the code to get the sq_slots before initializing the Send Queue attributes. 
Link: https://patch.msgid.link/r/1724042847-1481-5-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Hongguang Gao Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 108 +++++++++++++---------- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 16 +++- include/uapi/rdma/bnxt_re-abi.h | 6 ++ 3 files changed, 83 insertions(+), 47 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 4dd137b7a5ce..2932db129958 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1017,20 +1017,15 @@ static int bnxt_re_setup_swqe_size(struct bnxt_re_qp *qp, } static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, - struct bnxt_re_qp *qp, struct ib_udata *udata) + struct bnxt_re_qp *qp, struct bnxt_re_ucontext *cntx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_qp *qplib_qp; - struct bnxt_re_ucontext *cntx; - struct bnxt_re_qp_req ureq; int bytes = 0, psn_sz; struct ib_umem *umem; int psn_nume; qplib_qp = &qp->qplib_qp; - cntx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, - ib_uctx); - if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) - return -EFAULT; bytes = (qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size); /* Consider mapping PSN search memory only for RC QPs. */ @@ -1038,17 +1033,20 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, psn_sz = bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx) ? sizeof(struct sq_psn_search_ext) : sizeof(struct sq_psn_search); - psn_nume = (qplib_qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? - qplib_qp->sq.max_wqe : - ((qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size) / - sizeof(struct bnxt_qplib_sge)); + if (cntx && bnxt_re_is_var_size_supported(rdev, cntx)) { + psn_nume = ureq->sq_slots; + } else { + psn_nume = (qplib_qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? 
+ qplib_qp->sq.max_wqe : ((qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size) / + sizeof(struct bnxt_qplib_sge)); + } if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2)) psn_nume = roundup_pow_of_two(psn_nume); bytes += (psn_nume * psn_sz); } bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes, + umem = ib_umem_get(&rdev->ibdev, ureq->qpsva, bytes, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -1057,12 +1055,12 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, qplib_qp->sq.sg_info.umem = umem; qplib_qp->sq.sg_info.pgsize = PAGE_SIZE; qplib_qp->sq.sg_info.pgshft = PAGE_SHIFT; - qplib_qp->qp_handle = ureq.qp_handle; + qplib_qp->qp_handle = ureq->qp_handle; if (!qp->qplib_qp.srq) { bytes = (qplib_qp->rq.max_wqe * qplib_qp->rq.wqe_size); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes, + umem = ib_umem_get(&rdev->ibdev, ureq->qprva, bytes, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) goto rqfail; @@ -1261,14 +1259,15 @@ static void bnxt_re_adjust_gsi_rq_attr(struct bnxt_re_qp *qp) static int bnxt_re_init_sq_attr(struct bnxt_re_qp *qp, struct ib_qp_init_attr *init_attr, - struct bnxt_re_ucontext *uctx) + struct bnxt_re_ucontext *uctx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_dev_attr *dev_attr; struct bnxt_qplib_qp *qplqp; struct bnxt_re_dev *rdev; struct bnxt_qplib_q *sq; + int diff = 0; int entries; - int diff; int rc; rdev = qp->rdev; @@ -1277,22 +1276,28 @@ static int bnxt_re_init_sq_attr(struct bnxt_re_qp *qp, dev_attr = &rdev->dev_attr; sq->max_sge = init_attr->cap.max_send_sge; - if (sq->max_sge > dev_attr->max_qp_sges) { - sq->max_sge = dev_attr->max_qp_sges; - init_attr->cap.max_send_sge = sq->max_sge; - } - - rc = bnxt_re_setup_swqe_size(qp, init_attr); - if (rc) - return rc; - entries = init_attr->cap.max_send_wr; - /* Allocate 128 + 1 more than what's provided */ - diff = (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) ? 
- 0 : BNXT_QPLIB_RESERVED_QP_WRS; - entries = bnxt_re_init_depth(entries + diff + 1, uctx); - sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1); - sq->max_sw_wqe = bnxt_qplib_get_depth(sq, qplqp->wqe_mode, true); + if (uctx && qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) { + sq->max_wqe = ureq->sq_slots; + sq->max_sw_wqe = ureq->sq_slots; + sq->wqe_size = sizeof(struct sq_sge); + } else { + if (sq->max_sge > dev_attr->max_qp_sges) { + sq->max_sge = dev_attr->max_qp_sges; + init_attr->cap.max_send_sge = sq->max_sge; + } + + rc = bnxt_re_setup_swqe_size(qp, init_attr); + if (rc) + return rc; + + /* Allocate 128 + 1 more than what's provided */ + diff = (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) ? + 0 : BNXT_QPLIB_RESERVED_QP_WRS; + entries = bnxt_re_init_depth(entries + diff + 1, uctx); + sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1); + sq->max_sw_wqe = bnxt_qplib_get_depth(sq, qplqp->wqe_mode, true); + } sq->q_full_delta = diff + 1; /* * Reserving one slot for Phantom WQE. 
Application can @@ -1355,10 +1360,10 @@ out: static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata) + struct bnxt_re_ucontext *uctx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_dev_attr *dev_attr; - struct bnxt_re_ucontext *uctx; struct bnxt_qplib_qp *qplqp; struct bnxt_re_dev *rdev; struct bnxt_re_cq *cq; @@ -1368,7 +1373,6 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, qplqp = &qp->qplib_qp; dev_attr = &rdev->dev_attr; - uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); /* Setup misc params */ ether_addr_copy(qplqp->smac, rdev->netdev->dev_addr); qplqp->pd = &pd->qplib_pd; @@ -1381,8 +1385,7 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, goto out; } qplqp->type = (u8)qptype; - qplqp->wqe_mode = rdev->chip_ctx->modes.wqe_mode; - + qplqp->wqe_mode = bnxt_re_is_var_size_supported(rdev, uctx); if (init_attr->qp_type == IB_QPT_RC) { qplqp->max_rd_atomic = dev_attr->max_qp_rd_atom; qplqp->max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom; @@ -1417,14 +1420,14 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, bnxt_re_adjust_gsi_rq_attr(qp); /* Setup SQ */ - rc = bnxt_re_init_sq_attr(qp, init_attr, uctx); + rc = bnxt_re_init_sq_attr(qp, init_attr, uctx, ureq); if (rc) goto out; if (init_attr->qp_type == IB_QPT_GSI) bnxt_re_adjust_gsi_sq_attr(qp, init_attr, uctx); - if (udata) /* This will update DPI and qp_handle */ - rc = bnxt_re_init_user_qp(rdev, pd, qp, udata); + if (uctx) /* This will update DPI and qp_handle */ + rc = bnxt_re_init_user_qp(rdev, pd, qp, uctx, ureq); out: return rc; } @@ -1525,14 +1528,27 @@ static bool bnxt_re_test_qp_limits(struct bnxt_re_dev *rdev, int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata) { - struct ib_pd *ib_pd = ib_qp->pd; - struct bnxt_re_pd *pd = container_of(ib_pd, 
struct bnxt_re_pd, ib_pd); - struct bnxt_re_dev *rdev = pd->rdev; - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; - struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_qplib_dev_attr *dev_attr; + struct bnxt_re_ucontext *uctx; + struct bnxt_re_qp_req ureq; + struct bnxt_re_dev *rdev; + struct bnxt_re_pd *pd; + struct bnxt_re_qp *qp; + struct ib_pd *ib_pd; u32 active_qps; int rc; + ib_pd = ib_qp->pd; + pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + rdev = pd->rdev; + dev_attr = &rdev->dev_attr; + qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + + uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); + if (udata) + if (ib_copy_from_udata(&ureq, udata, min(udata->inlen, sizeof(ureq)))) + return -EFAULT; + rc = bnxt_re_test_qp_limits(rdev, qp_init_attr, dev_attr); if (!rc) { rc = -EINVAL; @@ -1540,7 +1556,7 @@ int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, } qp->rdev = rdev; - rc = bnxt_re_init_qp_attr(qp, pd, qp_init_attr, udata); + rc = bnxt_re_init_qp_attr(qp, pd, qp_init_attr, uctx, &ureq); if (rc) goto fail; @@ -4215,7 +4231,7 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) goto cfail; if (ureq.comp_mask & BNXT_RE_COMP_MASK_REQ_UCNTX_POW2_SUPPORT) { resp.comp_mask |= BNXT_RE_UCNTX_CMASK_POW2_DISABLED; - uctx->cmask |= BNXT_RE_UCNTX_CMASK_POW2_DISABLED; + uctx->cmask |= BNXT_RE_UCNTX_CAP_POW2_DISABLED; } } diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index e98cb1717338..7c8350fb8aad 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -171,12 +171,26 @@ static inline u16 bnxt_re_get_rwqe_size(int nsge) return sizeof(struct rq_wqe_hdr) + (nsge * sizeof(struct sq_sge)); } +enum { + BNXT_RE_UCNTX_CAP_POW2_DISABLED = 0x1ULL, + BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED = 0x2ULL, +}; + static inline u32 bnxt_re_init_depth(u32 ent, struct 
bnxt_re_ucontext *uctx) { - return uctx ? (uctx->cmask & BNXT_RE_UCNTX_CMASK_POW2_DISABLED) ? + return uctx ? (uctx->cmask & BNXT_RE_UCNTX_CAP_POW2_DISABLED) ? ent : roundup_pow_of_two(ent) : ent; } +static inline bool bnxt_re_is_var_size_supported(struct bnxt_re_dev *rdev, + struct bnxt_re_ucontext *uctx) +{ + if (uctx) + return uctx->cmask & BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED; + else + return rdev->chip_ctx->modes.wqe_mode; +} + int bnxt_re_query_device(struct ib_device *ibdev, struct ib_device_attr *ib_attr, struct ib_udata *udata); diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index e61104f35d73..71140618700a 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -118,10 +118,16 @@ struct bnxt_re_resize_cq_req { __aligned_u64 cq_va; }; +enum bnxt_re_qp_mask { + BNXT_RE_QP_REQ_MASK_VAR_WQE_SQ_SLOTS = 0x1, +}; + struct bnxt_re_qp_req { __aligned_u64 qpsva; __aligned_u64 qprva; __aligned_u64 qp_handle; + __aligned_u64 comp_mask; + __u32 sq_slots; }; struct bnxt_re_qp_resp { From 10a104c0debbb19a1e45193d5670510216e339ff Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Sun, 18 Aug 2024 21:47:27 -0700 Subject: [PATCH 30/99] RDMA/bnxt_re: Enable variable size WQEs for user space applications Add backward compatibility code to enable variable size WQEs only if the user lib supports it. 
Link: https://patch.msgid.link/r/1724042847-1481-6-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Hongguang Gao Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 5 +++++ include/uapi/rdma/bnxt_re-abi.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 2932db129958..82444fd748f1 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -4233,6 +4233,11 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_POW2_DISABLED; uctx->cmask |= BNXT_RE_UCNTX_CAP_POW2_DISABLED; } + if (ureq.comp_mask & BNXT_RE_COMP_MASK_REQ_UCNTX_VAR_WQE_SUPPORT) { + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; + resp.mode = rdev->chip_ctx->modes.wqe_mode; + uctx->cmask |= BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED; + } } rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index 71140618700a..6821002931c8 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -66,6 +66,7 @@ enum bnxt_re_wqe_mode { enum { BNXT_RE_COMP_MASK_REQ_UCNTX_POW2_SUPPORT = 0x01, + BNXT_RE_COMP_MASK_REQ_UCNTX_VAR_WQE_SUPPORT = 0x02, }; struct bnxt_re_uctx_req { From 4842cfb07a4faa4b9768e496925e48b50875ee13 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Wed, 21 Aug 2024 13:22:07 +0200 Subject: [PATCH 31/99] RDMA/rtrs: For HB error add additional clt/srv specific logging In case of HB error, we need to know the specific path on which it happened, for better debugging. Since the clt/srv path structures are not available in rtrs.c, it needs to be done in the individual HB error handler. This commit adds that logging. A sample kernel log output after this commit: rtrs_core L357: : HB missed max reached.
rtrs_server L717: : HB err handler for path=ip:x.x.x.x@ip:x.x.x.x . . rtrs_core L357: : HB missed max reached. rtrs_client L1519: : HB err handler for path=ip:x.x.x.x@ip:x.x.x.x Signed-off-by: Md Haris Iqbal Reviewed-by: Jack Wang Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20240821112217.41827-2-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 2 ++ drivers/infiniband/ulp/rtrs/rtrs-srv.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 88106cf5ce55..66ac4dba990f 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1494,7 +1494,9 @@ static bool rtrs_clt_change_state_get_old(struct rtrs_clt_path *clt_path, static void rtrs_clt_hb_err_handler(struct rtrs_con *c) { struct rtrs_clt_con *con = container_of(c, typeof(*con), c); + struct rtrs_clt_path *clt_path = to_clt_path(con->c.path); + rtrs_err(con->c.path, "HB err handler for path=%s\n", kobject_name(&clt_path->kobj)); rtrs_rdma_error_recovery(con); } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 1d33efb8fb03..f76d483c3784 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -672,6 +672,10 @@ err: static void rtrs_srv_hb_err_handler(struct rtrs_con *c) { + struct rtrs_srv_con *con = container_of(c, typeof(*con), c); + struct rtrs_srv_path *srv_path = to_srv_path(con->c.path); + + rtrs_err(con->c.path, "HB err handler for path=%s\n", kobject_name(&srv_path->kobj)); close_path(to_srv_path(c->path)); } From 8c8dd4e13bd5a716a0a2fa58e7e2f08b4af073f7 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 21 Aug 2024 13:22:08 +0200 Subject: [PATCH 32/99] RDMA/rtrs-clt: Fix need_inv setting in error case In some cases need_inv can be missed for write requests, additionally driver has to handle missing invalidates for write 
requests. While at it, remove the else case from write invalidate path as it is possible to reach there. Signed-off-by: Jack Wang Signed-off-by: Md Haris Iqbal Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20240821112217.41827-3-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 66ac4dba990f..d09018c11ece 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -391,11 +391,12 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, clt_path = to_clt_path(con->c.path); if (req->sg_cnt) { - if (req->dir == DMA_FROM_DEVICE && req->need_inv) { + if (req->need_inv) { /* - * We are here to invalidate read requests + * We are here to invalidate read/write requests * ourselves. In normal scenario server should - * send INV for all read requests, but + * send INV for all read requests, we do chained local + * invalidate for write requests, but * we are here, thus two things could happen: * * 1. this is failover, when errno != 0 @@ -422,14 +423,6 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, req->mr->rkey, err); } else if (can_wait) { wait_for_completion(&req->inv_comp); - } else { - /* - * Something went wrong, so request will be - * completed from INV callback. 
- */ - WARN_ON_ONCE(1); - - return; } if (!refcount_dec_and_test(&req->ref)) return; @@ -1146,6 +1139,7 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) }; wr = &rwr.wr; fr_en = true; + req->need_inv = true; refcount_inc(&req->ref); } /* @@ -1164,6 +1158,10 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) clt_path->hca_port); if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) atomic_dec(&clt_path->stats->inflight); + if (req->need_inv) { + req->need_inv = false; + refcount_dec(&req->ref); + } if (req->sg_cnt) ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist, req->sg_cnt, req->dir); From 53c26f3ecd59e8024692cea339c35f9aceb0f178 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 21 Aug 2024 13:22:09 +0200 Subject: [PATCH 33/99] RDMA/rtrs-clt: Rate limit errors in IO path On network errors, a large number of these logs are printed due to all the inflight IOs, rate limit them so they do not clutter kernel log. Signed-off-by: Jack Wang Signed-off-by: Md Haris Iqbal Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20240821112217.41827-4-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index d09018c11ece..b34eb4908185 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -331,7 +331,7 @@ static void rtrs_clt_fast_reg_done(struct ib_cq *cq, struct ib_wc *wc) struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context); if (wc->status != IB_WC_SUCCESS) { - rtrs_err(con->c.path, "Failed IB_WR_REG_MR: %s\n", + rtrs_err_rl(con->c.path, "Failed IB_WR_REG_MR: %s\n", ib_wc_status_msg(wc->status)); rtrs_rdma_error_recovery(con); } @@ -351,7 +351,7 @@ static void rtrs_clt_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context); if (wc->status != 
IB_WC_SUCCESS) { - rtrs_err(con->c.path, "Failed IB_WR_LOCAL_INV: %s\n", + rtrs_err_rl(con->c.path, "Failed IB_WR_LOCAL_INV: %s\n", ib_wc_status_msg(wc->status)); rtrs_rdma_error_recovery(con); } @@ -419,7 +419,7 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, refcount_inc(&req->ref); err = rtrs_inv_rkey(req); if (err) { - rtrs_err(con->c.path, "Send INV WR key=%#x: %d\n", + rtrs_err_rl(con->c.path, "Send INV WR key=%#x: %d\n", req->mr->rkey, err); } else if (can_wait) { wait_for_completion(&req->inv_comp); From 3258cbbd86deaa2675e1799bc3d18bd1ef472641 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 21 Aug 2024 13:22:10 +0200 Subject: [PATCH 34/99] RDMA/rtrs: Reset hb_missed_cnt after receiving other traffic from peer Reset hb_missed_cnt after receiving other traffic from the peer, so that hb is more robust against high load on the host or network. Fixes: 6a98d71daea1 ("RDMA/rtrs: client: main functionality") Signed-off-by: Jack Wang Signed-off-by: Md Haris Iqbal Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20240821112217.41827-5-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 3 ++- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index b34eb4908185..c1bca8972015 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -619,6 +619,7 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) */ if (WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done)) return; + clt_path->s.hb_missed_cnt = 0; rtrs_from_imm(be32_to_cpu(wc->ex.imm_data), &imm_type, &imm_payload); if (imm_type == RTRS_IO_RSP_IMM || @@ -636,7 +637,6 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) return rtrs_clt_recv_done(con, wc); } else if (imm_type == RTRS_HB_ACK_IMM) { WARN_ON(con->c.cid); - 
clt_path->s.hb_missed_cnt = 0; clt_path->s.hb_cur_latency = ktime_sub(ktime_get(), clt_path->s.hb_last_sent); if (clt_path->flags & RTRS_MSG_NEW_RKEY_F) @@ -663,6 +663,7 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) /* * Key invalidations from server side */ + clt_path->s.hb_missed_cnt = 0; WARN_ON(!(wc->wc_flags & IB_WC_WITH_INVALIDATE || wc->wc_flags & IB_WC_WITH_IMM)); WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index f76d483c3784..ffd3e80596d0 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -1233,6 +1233,7 @@ static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc) */ if (WARN_ON(wc->wr_cqe != &io_comp_cqe)) return; + srv_path->s.hb_missed_cnt = 0; err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); if (err) { rtrs_err(s, "rtrs_post_recv(), err: %d\n", err); From 6793f9581f75d98c5ba0753e863a5442a9dff07c Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 21 Aug 2024 13:22:11 +0200 Subject: [PATCH 35/99] RDMA/rtrs-clt: Reuse need_inval from mr mr has a member need_inval, which can be used to indicate if local invalidate is needed, switch to it and remove need_inv from rtrs_clt_io_req. 
Signed-off-by: Jack Wang Signed-off-by: Md Haris Iqbal Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20240821112217.41827-6-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 18 +++++++++--------- drivers/infiniband/ulp/rtrs/rtrs-clt.h | 1 - 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index c1bca8972015..e1557b0cda05 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -355,7 +355,7 @@ static void rtrs_clt_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) ib_wc_status_msg(wc->status)); rtrs_rdma_error_recovery(con); } - req->need_inv = false; + req->mr->need_inval = false; if (req->need_inv_comp) complete(&req->inv_comp); else @@ -391,7 +391,7 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, clt_path = to_clt_path(con->c.path); if (req->sg_cnt) { - if (req->need_inv) { + if (req->mr->need_inval) { /* * We are here to invalidate read/write requests * ourselves. 
In normal scenario server should @@ -494,7 +494,7 @@ static void process_io_rsp(struct rtrs_clt_path *clt_path, u32 msg_id, req = &clt_path->reqs[msg_id]; /* Drop need_inv if server responded with send with invalidation */ - req->need_inv &= !w_inval; + req->mr->need_inval &= !w_inval; complete_rdma_req(req, errno, true, false); } @@ -961,7 +961,7 @@ static void rtrs_clt_init_req(struct rtrs_clt_io_req *req, req->dir = dir; req->con = rtrs_permit_to_clt_con(clt_path, permit); req->conf = conf; - req->need_inv = false; + req->mr->need_inval = false; req->need_inv_comp = false; req->inv_errno = 0; refcount_set(&req->ref, 1); @@ -1140,8 +1140,8 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) }; wr = &rwr.wr; fr_en = true; - req->need_inv = true; refcount_inc(&req->ref); + req->mr->need_inval = true; } /* * Update stats now, after request is successfully sent it is not @@ -1159,8 +1159,8 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) clt_path->hca_port); if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) atomic_dec(&clt_path->stats->inflight); - if (req->need_inv) { - req->need_inv = false; + if (req->mr->need_inval) { + req->mr->need_inval = false; refcount_dec(&req->ref); } if (req->sg_cnt) @@ -1236,7 +1236,7 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) msg->desc[0].len = cpu_to_le32(req->mr->length); /* Further invalidation is required */ - req->need_inv = !!RTRS_MSG_NEED_INVAL_F; + req->mr->need_inval = !!RTRS_MSG_NEED_INVAL_F; } else { msg->sg_cnt = 0; @@ -1269,7 +1269,7 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) clt_path->hca_port); if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) atomic_dec(&clt_path->stats->inflight); - req->need_inv = false; + req->mr->need_inval = false; if (req->sg_cnt) ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt, req->dir); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h index f848c0392d98..45dac15825f4 100644 --- 
a/drivers/infiniband/ulp/rtrs/rtrs-clt.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h @@ -115,7 +115,6 @@ struct rtrs_clt_io_req { struct completion inv_comp; int inv_errno; bool need_inv_comp; - bool need_inv; refcount_t ref; }; From 3e4289b29e216a55d08a89e126bc0b37cbad9f38 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Wed, 21 Aug 2024 13:22:12 +0200 Subject: [PATCH 36/99] RDMA/rtrs-clt: Reset cid to con_num - 1 to stay in bounds In the function init_conns(), if something fails after the create_con() and create_cm() for loop, the cleanup for loop after the destroy tag accesses out-of-bounds memory because cid is set to clt_path->s.con_num. This commit resets the cid to clt_path->s.con_num - 1, to stay in bounds in the cleanup loop later. Fixes: 6a98d71daea1 ("RDMA/rtrs: client: main functionality") Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20240821112217.41827-7-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index e1557b0cda05..777f8e52ed7c 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -2347,6 +2347,12 @@ static int init_conns(struct rtrs_clt_path *clt_path) if (err) goto destroy; } + + /* + * Set the cid to con_num - 1, since if we fail later, we want to stay in bounds. + */ + cid = clt_path->s.con_num - 1; + err = alloc_path_reqs(clt_path); if (err) goto destroy; From ff7395890580447507b27851c06e396f75913443 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 21 Aug 2024 13:22:13 +0200 Subject: [PATCH 37/99] RDMA/rtrs-clt: Print request type for errors Extend the output to print also the request type.
Signed-off-by: Jack Wang Signed-off-by: Grzegorz Prajsner Signed-off-by: Md Haris Iqbal Link: https://patch.msgid.link/20240821112217.41827-8-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 777f8e52ed7c..7c6d40380638 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -439,8 +439,10 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, req->con = NULL; if (errno) { - rtrs_err_rl(con->c.path, "IO request failed: error=%d path=%s [%s:%u] notify=%d\n", - errno, kobject_name(&clt_path->kobj), clt_path->hca_name, + rtrs_err_rl(con->c.path, + "IO %s request failed: error=%d path=%s [%s:%u] notify=%d\n", + req->dir == DMA_TO_DEVICE ? "write" : "read", errno, + kobject_name(&clt_path->kobj), clt_path->hca_name, clt_path->hca_port, notify); } From d0e62bf7b575fbfe591f6f570e7595dd60a2f5eb Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Wed, 21 Aug 2024 13:22:14 +0200 Subject: [PATCH 38/99] RDMA/rtrs-srv: Avoid null pointer deref during path establishment For RTRS path establishment, RTRS client initiates and completes con_num of connections. After establishing all its connections, the information is exchanged between the client and server through the info_req message. During this exchange, it is essential that all connections have been established, and the state of the RTRS srv path is CONNECTED. So add these sanity checks, to make sure we detect and abort process in error scenarios to avoid null pointer deref. 
Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20240821112217.41827-9-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index ffd3e80596d0..05d15ff074bb 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -935,12 +935,11 @@ static void rtrs_srv_info_req_done(struct ib_cq *cq, struct ib_wc *wc) if (err) goto close; -out: rtrs_iu_free(iu, srv_path->s.dev->ib_dev, 1); return; close: + rtrs_iu_free(iu, srv_path->s.dev->ib_dev, 1); close_path(srv_path); - goto out; } static int post_recv_info_req(struct rtrs_srv_con *con) @@ -991,6 +990,16 @@ static int post_recv_path(struct rtrs_srv_path *srv_path) q_size = SERVICE_CON_QUEUE_DEPTH; else q_size = srv->queue_depth; + if (srv_path->state != RTRS_SRV_CONNECTING) { + rtrs_err(s, "Path state invalid. state %s\n", + rtrs_srv_state_str(srv_path->state)); + return -EIO; + } + + if (!srv_path->s.con[cid]) { + rtrs_err(s, "Conn not set for %d\n", cid); + return -EIO; + } err = post_recv_io(to_srv_con(srv_path->s.con[cid]), q_size); if (err) { From 667db86bcbe82e789d82c2e8c8c40756ec2e1999 Mon Sep 17 00:00:00 2001 From: Grzegorz Prajsner Date: Wed, 21 Aug 2024 13:22:15 +0200 Subject: [PATCH 39/99] RDMA/rtrs: Register ib event handler Use ib_register_event_handler() to register event handlers for both client and server side. For now, all those handlers do, is to print type of incoming event. 
Signed-off-by: Grzegorz Prajsner Signed-off-by: Jack Wang Signed-off-by: Md Haris Iqbal Link: https://patch.msgid.link/20240821112217.41827-10-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 21 +++++++++++++++- drivers/infiniband/ulp/rtrs/rtrs-clt.h | 2 ++ drivers/infiniband/ulp/rtrs/rtrs-pri.h | 2 ++ drivers/infiniband/ulp/rtrs/rtrs-srv.c | 33 +++++++++++++++++++++++++- drivers/infiniband/ulp/rtrs/rtrs-srv.h | 2 ++ 5 files changed, 58 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 7c6d40380638..230e5f6c8c90 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -3149,8 +3149,20 @@ close_path: return err; } +void rtrs_clt_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent) +{ + pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), + ibevent->event); +} + + static int rtrs_clt_ib_dev_init(struct rtrs_ib_dev *dev) { + INIT_IB_EVENT_HANDLER(&dev->event_handler, dev->ib_dev, + rtrs_clt_ib_event_handler); + ib_register_event_handler(&dev->event_handler); + if (!(dev->ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { pr_err("Memory registrations not supported.\n"); @@ -3160,8 +3172,15 @@ static int rtrs_clt_ib_dev_init(struct rtrs_ib_dev *dev) return 0; } +static void rtrs_clt_ib_dev_deinit(struct rtrs_ib_dev *dev) +{ + ib_unregister_event_handler(&dev->event_handler); +} + + static const struct rtrs_rdma_dev_pd_ops dev_pd_ops = { - .init = rtrs_clt_ib_dev_init + .init = rtrs_clt_ib_dev_init, + .deinit = rtrs_clt_ib_dev_deinit }; static int __init rtrs_client_init(void) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h index 45dac15825f4..0f57759b3080 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h @@ -212,6 +212,8 @@ int 
rtrs_clt_remove_path_from_sysfs(struct rtrs_clt_path *path, void rtrs_clt_set_max_reconnect_attempts(struct rtrs_clt_sess *clt, int value); int rtrs_clt_get_max_reconnect_attempts(const struct rtrs_clt_sess *clt); void free_path(struct rtrs_clt_path *clt_path); +void rtrs_clt_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent); /* rtrs-clt-stats.c */ diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h index ab25619261d2..ef29bd483b5a 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-pri.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h @@ -69,6 +69,7 @@ struct rtrs_ib_dev; struct rtrs_rdma_dev_pd_ops { int (*init)(struct rtrs_ib_dev *dev); + void (*deinit)(struct rtrs_ib_dev *dev); }; struct rtrs_rdma_dev_pd { @@ -84,6 +85,7 @@ struct rtrs_ib_dev { struct kref ref; struct list_head entry; struct rtrs_rdma_dev_pd *pool; + struct ib_event_handler event_handler; }; struct rtrs_con { diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 05d15ff074bb..e83d95647852 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -26,7 +26,10 @@ MODULE_LICENSE("GPL"); #define DEFAULT_SESS_QUEUE_DEPTH 512 #define MAX_HDR_SIZE PAGE_SIZE -static struct rtrs_rdma_dev_pd dev_pd; +static const struct rtrs_rdma_dev_pd_ops dev_pd_ops; +static struct rtrs_rdma_dev_pd dev_pd = { + .ops = &dev_pd_ops +}; const struct class rtrs_dev_class = { .name = "rtrs-server", }; @@ -2269,6 +2272,34 @@ static int check_module_params(void) return 0; } +void rtrs_srv_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent) +{ + pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), + ibevent->event); +} + +static int rtrs_srv_ib_dev_init(struct rtrs_ib_dev *dev) +{ + INIT_IB_EVENT_HANDLER(&dev->event_handler, dev->ib_dev, + rtrs_srv_ib_event_handler); + ib_register_event_handler(&dev->event_handler); + + return 0; +} + 
+static void rtrs_srv_ib_dev_deinit(struct rtrs_ib_dev *dev) +{ + ib_unregister_event_handler(&dev->event_handler); +} + + +static const struct rtrs_rdma_dev_pd_ops dev_pd_ops = { + .init = rtrs_srv_ib_dev_init, + .deinit = rtrs_srv_ib_dev_deinit +}; + + static int __init rtrs_server_init(void) { int err; diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.h b/drivers/infiniband/ulp/rtrs/rtrs-srv.h index 5e325b82ff33..014f85681f37 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.h @@ -132,6 +132,8 @@ struct rtrs_srv_ib_ctx { extern const struct class rtrs_dev_class; void close_path(struct rtrs_srv_path *srv_path); +void rtrs_srv_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent); static inline void rtrs_srv_update_rdma_stats(struct rtrs_srv_stats *s, size_t size, int d) From bab9f8db4295c09df6fea5d197798186d01b8226 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 21 Aug 2024 13:22:16 +0200 Subject: [PATCH 40/99] RDMA/rtrs-clt: Do local invalidate after write io completion Do the local invalidate after write I/O completion instead of chaining it to the write WR; this fixes the local protection error seen on the LOCAL INVALIDATE WR. Signed-off-by: Jack Wang Signed-off-by: Md Haris Iqbal Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20240821112217.41827-11-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 230e5f6c8c90..fb548d6a0aae 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -395,9 +395,9 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, /* * We are here to invalidate read/write requests * ourselves.
In normal scenario server should - * send INV for all read requests, we do chained local - * invalidate for write requests, but - * we are here, thus two things could happen: + * send INV for all read requests, we do local + * invalidate for write requests ourselves, but + * we are here, thus three things could happen: * * 1. this is failover, when errno != 0 * and can_wait == 1, @@ -405,6 +405,9 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, * 2. something totally bad happened and * server forgot to send INV, so we * should do that ourselves. + * + * 3. write request finishes, we need to do local + * invalidate */ if (can_wait) { @@ -1085,7 +1088,6 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) int ret, count = 0; u32 imm, buf_id; struct ib_reg_wr rwr; - struct ib_send_wr inv_wr; struct ib_send_wr *wr = NULL; bool fr_en = false; @@ -1126,13 +1128,6 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) req->sg_cnt, req->dir); return ret; } - inv_wr = (struct ib_send_wr) { - .opcode = IB_WR_LOCAL_INV, - .wr_cqe = &req->inv_cqe, - .send_flags = IB_SEND_SIGNALED, - .ex.invalidate_rkey = req->mr->rkey, - }; - req->inv_cqe.done = rtrs_clt_inv_rkey_done; rwr = (struct ib_reg_wr) { .wr.opcode = IB_WR_REG_MR, .wr.wr_cqe = &fast_reg_cqe, @@ -1142,7 +1137,6 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) }; wr = &rwr.wr; fr_en = true; - refcount_inc(&req->ref); req->mr->need_inval = true; } /* @@ -1153,7 +1147,7 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) ret = rtrs_post_rdma_write_sg(req->con, req, rbuf, fr_en, count, req->usr_len + sizeof(*msg), - imm, wr, &inv_wr); + imm, wr, NULL); if (ret) { rtrs_err_rl(s, "Write request failed: error=%d path=%s [%s:%u]\n", From e5bba9e0276441d53e976a932e728e8ced019a8d Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 21 Aug 2024 13:22:17 +0200 Subject: [PATCH 41/99] RDMA/rtrs-clt: Remove an extra space No functional changes. 
Signed-off-by: Jack Wang Signed-off-by: Alexei Pastuchov Signed-off-by: Grzegorz Prajsner Signed-off-by: Md Haris Iqbal Link: https://patch.msgid.link/20240821112217.41827-12-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index fb548d6a0aae..71387811b281 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1208,7 +1208,7 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) ret = rtrs_map_sg_fr(req, count); if (ret < 0) { rtrs_err_rl(s, - "Read request failed, failed to map fast reg. data, err: %d\n", + "Read request failed, failed to map fast reg. data, err: %d\n", ret); ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt, req->dir); From 2d10b05bcef685572ce8962ecb0936952915d954 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 24 Aug 2024 17:16:29 +0800 Subject: [PATCH 42/99] RDMA/cxgb4: Remove unused declarations Since commit be4c9bad9d0e ("MAINTAINERS: Add cxgb4 and iw_cxgb4 entries") c4iw_post_terminate() declaration is not used anymore. And other declarations were never implemented since introduction in commit cfdda9d76436 ("RDMA/cxgb4: Add driver for Chelsio T4 RNIC"). 
Signed-off-by: Yue Haibing Link: https://patch.msgid.link/20240824091629.3659565-1-yuehaibing@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index bedd5ca96fdd..5b3007acaa1f 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -972,7 +972,6 @@ u32 c4iw_get_resource(struct c4iw_id_table *id_table); void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry); int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid, u32 nr_srqt); -int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev); int c4iw_pblpool_create(struct c4iw_rdev *rdev); int c4iw_rqtpool_create(struct c4iw_rdev *rdev); int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev); @@ -980,7 +979,6 @@ void c4iw_pblpool_destroy(struct c4iw_rdev *rdev); void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev); void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev); void c4iw_destroy_resource(struct c4iw_resource *rscp); -int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev); void c4iw_register_device(struct work_struct *work); void c4iw_unregister_device(struct c4iw_dev *dev); int __init c4iw_cm_init(void); @@ -1042,8 +1040,6 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp); int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count); int c4iw_flush_sq(struct c4iw_qp *qhp); int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid); -u16 c4iw_rqes_posted(struct c4iw_qp *qhp); -int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe); u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, struct c4iw_dev_ucontext *uctx); From e012316d83bdf52061317cc361ca07c9580883d9 Mon Sep 17 00:00:00 2001 From: Shen Lichuan Date: Wed, 28 Aug 2024 16:27:20 +0800 Subject: [PATCH 43/99] RDMA/rdmavt: Convert to use ERR_CAST() 
As opposed to open-code, using the ERR_CAST macro clearly indicates that this is a pointer to an error value and a type conversion was performed. Signed-off-by: Shen Lichuan Link: https://patch.msgid.link/20240828082720.33231-1-shenlichuan@vivo.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rdmavt/mr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 7a9afd5231d5..5ed5cfc2b280 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -348,13 +348,13 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, umem = ib_umem_get(pd->device, start, length, mr_access_flags); if (IS_ERR(umem)) - return (void *)umem; + return ERR_CAST(umem); n = ib_umem_num_pages(umem); mr = __rvt_alloc_mr(n, pd); if (IS_ERR(mr)) { - ret = (struct ib_mr *)mr; + ret = ERR_CAST(mr); goto bail_umem; } @@ -542,7 +542,7 @@ struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, mr = __rvt_alloc_mr(max_num_sg, pd); if (IS_ERR(mr)) - return (struct ib_mr *)mr; + return ERR_CAST(mr); return &mr->ibmr; } From 640c2cf84e1de62e6bb0738dc2128d5506e7e5bc Mon Sep 17 00:00:00 2001 From: Hongguang Gao Date: Thu, 29 Aug 2024 08:34:03 -0700 Subject: [PATCH 44/99] RDMA/bnxt_re: Get the toggle bits from SRQ events SRQ arming requires the toggle bits received from hardware. Get the toggle bits from SRQ notification for the gen p7 adapters. This value will be zero for the older adapters. 
Signed-off-by: Hongguang Gao Signed-off-by: Chandramohan Akula Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1724945645-14989-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 1 + drivers/infiniband/hw/bnxt_re/qplib_fp.c | 11 +++++++++++ drivers/infiniband/hw/bnxt_re/qplib_fp.h | 1 + 3 files changed, 13 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index bbbec4b1d201..060bf62eceba 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -77,6 +77,7 @@ struct bnxt_re_srq { struct bnxt_qplib_srq qplib_srq; struct ib_umem *umem; spinlock_t lock; /* protect srq */ + void *uctx_srq_page; }; struct bnxt_re_qp { diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 2810ffe3394b..42e98e5f94cb 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -54,6 +54,10 @@ #include "qplib_rcfw.h" #include "qplib_sp.h" #include "qplib_fp.h" +#include +#include "bnxt_ulp.h" +#include "bnxt_re.h" +#include "ib_verbs.h" static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp); @@ -347,6 +351,7 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) case NQ_BASE_TYPE_SRQ_EVENT: { struct bnxt_qplib_srq *srq; + struct bnxt_re_srq *srq_p; struct nq_srq_event *nqsrqe = (struct nq_srq_event *)nqe; @@ -354,6 +359,12 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high) << 32; srq = (struct bnxt_qplib_srq *)q_handle; + srq->toggle = (le16_to_cpu(nqe->info10_type) & NQ_CN_TOGGLE_MASK) + >> NQ_CN_TOGGLE_SFT; + srq->dbinfo.toggle = srq->toggle; + srq_p = container_of(srq, struct bnxt_re_srq, qplib_srq); + if (srq_p->uctx_srq_page) + *((u32 *)srq_p->uctx_srq_page) = srq->toggle; bnxt_qplib_armen_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ_ARMENA); if 
(nq->srqn_handler(nq, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 2e7a4fd651b8..b62df8701950 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -105,6 +105,7 @@ struct bnxt_qplib_srq { struct bnxt_qplib_sg_info sg_info; u16 eventq_hw_ring_id; spinlock_t lock; /* protect SRQE link list */ + u8 toggle; }; struct bnxt_qplib_sge { From b4207630e0040b2f8d59ee28bb645771db31d37f Mon Sep 17 00:00:00 2001 From: Chandramohan Akula Date: Thu, 29 Aug 2024 08:34:04 -0700 Subject: [PATCH 45/99] RDMA/bnxt_re: Refactor the BNXT_RE_METHOD_GET_TOGGLE_MEM method Refactor the code in this function to have common code. This is used in subsequent patches. Signed-off-by: Chandramohan Akula Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1724945645-14989-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 6ce1db90de88..eafbee4af067 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -4519,12 +4519,12 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_GET_TOGGLE_MEM)(struct uverbs_attr_bund struct bnxt_re_ucontext *uctx; struct ib_ucontext *ib_uctx; struct bnxt_re_dev *rdev; + u32 length = PAGE_SIZE; struct bnxt_re_cq *cq; u64 mem_offset; + u32 offset = 0; u64 addr = 0; - u32 length; - u32 offset; - u32 cq_id; + u32 res_id; int err; ib_uctx = ib_uverbs_get_ucontext(attrs); @@ -4537,21 +4537,17 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_GET_TOGGLE_MEM)(struct uverbs_attr_bund uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); rdev = uctx->rdev; + err = uverbs_copy_from(&res_id, attrs, BNXT_RE_TOGGLE_MEM_RES_ID); + if (err) + return err; switch (res_type) { case 
BNXT_RE_CQ_TOGGLE_MEM: - err = uverbs_copy_from(&cq_id, attrs, BNXT_RE_TOGGLE_MEM_RES_ID); - if (err) - return err; - - cq = bnxt_re_search_for_cq(rdev, cq_id); + cq = bnxt_re_search_for_cq(rdev, res_id); if (!cq) return -EINVAL; - length = PAGE_SIZE; addr = (u64)cq->uctx_cq_page; - mmap_flag = BNXT_RE_MMAP_TOGGLE_PAGE; - offset = 0; break; case BNXT_RE_SRQ_TOGGLE_MEM: break; From 181028a0d84cdcc7ac86d05cc49eaa416ce85c8b Mon Sep 17 00:00:00 2001 From: Chandramohan Akula Date: Thu, 29 Aug 2024 08:34:05 -0700 Subject: [PATCH 46/99] RDMA/bnxt_re: Share a page to expose per SRQ info with userspace Gen P7 adapters need to share the toggle bit information received in the kernel driver with user space. User space needs this info to arm the SRQ. A user space application can get this page using the UAPI routines. The library will mmap this page and get the toggle bits to be used in the next ARM doorbell. The driver uses a hash list to look up the SRQ structure from the SRQ ID. The SRQ structure is retrieved from the hash list when the library calls the UAPI routine to get the toggle page mapping. Currently the full page is mapped per SRQ.
This can be optimized to enable multiple SRQs from the same application share the same page and different offsets in the page Signed-off-by: Chandramohan Akula Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1724945645-14989-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 ++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 34 +++++++++++++++++++++++- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 1 + drivers/infiniband/hw/bnxt_re/main.c | 6 ++++- include/uapi/rdma/bnxt_re-abi.h | 6 +++++ 5 files changed, 47 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 0912d2fa9634..2be9a62d230f 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -141,6 +141,7 @@ struct bnxt_re_pacing { #define BNXT_RE_GRC_FIFO_REG_BASE 0x2000 #define MAX_CQ_HASH_BITS (16) +#define MAX_SRQ_HASH_BITS (16) struct bnxt_re_dev { struct ib_device ibdev; struct list_head list; @@ -196,6 +197,7 @@ struct bnxt_re_dev { struct work_struct dbq_fifo_check_work; struct delayed_work dbq_pacing_work; DECLARE_HASHTABLE(cq_hash, MAX_CQ_HASH_BITS); + DECLARE_HASHTABLE(srq_hash, MAX_SRQ_HASH_BITS); }; #define to_bnxt_re_dev(ptr, member) \ diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index eafbee4af067..f9f944a6094a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1707,6 +1707,10 @@ int bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata) if (qplib_srq->cq) nq = qplib_srq->cq->nq; + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) { + free_page((unsigned long)srq->uctx_srq_page); + hash_del(&srq->hash_entry); + } bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq); ib_umem_release(srq->umem); atomic_dec(&rdev->stats.res.srq_count); @@ -1811,9 +1815,18 @@ int 
bnxt_re_create_srq(struct ib_srq *ib_srq, } if (udata) { - struct bnxt_re_srq_resp resp; + struct bnxt_re_srq_resp resp = {}; resp.srqid = srq->qplib_srq.id; + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) { + hash_add(rdev->srq_hash, &srq->hash_entry, srq->qplib_srq.id); + srq->uctx_srq_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!srq->uctx_srq_page) { + rc = -ENOMEM; + goto fail; + } + resp.comp_mask |= BNXT_RE_SRQ_TOGGLE_PAGE_SUPPORT; + } rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (rc) { ibdev_err(&rdev->ibdev, "SRQ copy to udata failed!"); @@ -4291,6 +4304,19 @@ static struct bnxt_re_cq *bnxt_re_search_for_cq(struct bnxt_re_dev *rdev, u32 cq return cq; } +static struct bnxt_re_srq *bnxt_re_search_for_srq(struct bnxt_re_dev *rdev, u32 srq_id) +{ + struct bnxt_re_srq *srq = NULL, *tmp_srq; + + hash_for_each_possible(rdev->srq_hash, tmp_srq, hash_entry, srq_id) { + if (tmp_srq->qplib_srq.id == srq_id) { + srq = tmp_srq; + break; + } + } + return srq; +} + /* Helper function to mmap the virtual memory from user app */ int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) { @@ -4519,6 +4545,7 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_GET_TOGGLE_MEM)(struct uverbs_attr_bund struct bnxt_re_ucontext *uctx; struct ib_ucontext *ib_uctx; struct bnxt_re_dev *rdev; + struct bnxt_re_srq *srq; u32 length = PAGE_SIZE; struct bnxt_re_cq *cq; u64 mem_offset; @@ -4550,6 +4577,11 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_GET_TOGGLE_MEM)(struct uverbs_attr_bund addr = (u64)cq->uctx_cq_page; break; case BNXT_RE_SRQ_TOGGLE_MEM: + srq = bnxt_re_search_for_srq(rdev, res_id); + if (!srq) + return -EINVAL; + + addr = (u64)srq->uctx_srq_page; break; default: diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 060bf62eceba..b789e47ec97a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -78,6 +78,7 @@ struct bnxt_re_srq { 
struct ib_umem *umem; spinlock_t lock; /* protect srq */ void *uctx_srq_page; + struct hlist_node hash_entry; }; struct bnxt_re_qp { diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 31ba89cffe9d..16a84ca1ce48 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -139,8 +139,10 @@ static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev) if (bnxt_re_hwrm_qcaps(rdev)) dev_err(rdev_to_dev(rdev), "Failed to query hwrm qcaps\n"); - if (bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx)) + if (bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx)) { cctx->modes.toggle_bits |= BNXT_QPLIB_CQ_TOGGLE_BIT; + cctx->modes.toggle_bits |= BNXT_QPLIB_SRQ_TOGGLE_BIT; + } } static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) @@ -1771,6 +1773,8 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev) bnxt_re_vf_res_config(rdev); } hash_init(rdev->cq_hash); + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) + hash_init(rdev->srq_hash); return 0; free_sctx: diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index 6821002931c8..faa9d62b3b30 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -141,8 +141,14 @@ struct bnxt_re_srq_req { __aligned_u64 srq_handle; }; +enum bnxt_re_srq_mask { + BNXT_RE_SRQ_TOGGLE_PAGE_SUPPORT = 0x1, +}; + struct bnxt_re_srq_resp { __u32 srqid; + __u32 rsvd; /* padding */ + __aligned_u64 comp_mask; }; enum bnxt_re_shpg_offt { From 9e517a8e9d9a303bf9bde35e5c5374795544c152 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 30 Aug 2024 08:16:32 -0700 Subject: [PATCH 47/99] RDMA/mana_ib: use the correct page table index based on hardware page size MANA hardware uses 4k page size. When calculating the page table index, it should use the hardware page size, not the system page size. 
Cc: stable@vger.kernel.org Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Signed-off-by: Long Li Link: https://patch.msgid.link/1725030993-16213-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index d13abc954d2a..f68f54aea820 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -383,7 +383,7 @@ static int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem create_req->length = umem->length; create_req->offset_in_page = ib_umem_dma_offset(umem, page_sz); - create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; + create_req->gdma_page_type = order_base_2(page_sz) - MANA_PAGE_SHIFT; create_req->page_count = num_pages_total; ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n", From 4a3b99bc04e501b816db78f70064e26a01257910 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 30 Aug 2024 08:16:33 -0700 Subject: [PATCH 48/99] RDMA/mana_ib: use the correct page size for mapping user-mode doorbell page When mapping doorbell page from user-mode, the driver should use the system page size as this memory is allocated via mmap() from user-mode. 
Cc: stable@vger.kernel.org Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Signed-off-by: Long Li Link: https://patch.msgid.link/1725030993-16213-2-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index f68f54aea820..67c2d43135a8 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -511,13 +511,13 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) PAGE_SHIFT; prot = pgprot_writecombine(vma->vm_page_prot); - ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot, + ret = rdma_user_mmap_io(ibcontext, vma, pfn, PAGE_SIZE, prot, NULL); if (ret) ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret); else - ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n", - pfn, gc->db_page_size, ret); + ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %lu, ret %d\n", + pfn, PAGE_SIZE, ret); return ret; } From 543b455c6e9cf08b9a96a06a4680a1ffcb299701 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Sun, 1 Sep 2024 22:52:28 -0700 Subject: [PATCH 49/99] RDMA/bnxt_re: Update HW interface headers Updating the HW structures for the pcie relax ordering support. Newly added interface structures will be used in the followup patch. 
Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1725256351-12751-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/roce_hsi.h | 36 ++++++++++++++++++------ 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 042530969505..3ec895284e49 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -409,7 +409,7 @@ struct creq_deinitialize_fw_resp { u8 reserved48[6]; }; -/* cmdq_create_qp (size:768b/96B) */ +/* cmdq_create_qp (size:832b/104B) */ struct cmdq_create_qp { u8 opcode; #define CMDQ_CREATE_QP_OPCODE_CREATE_QP 0x1UL @@ -430,8 +430,11 @@ struct cmdq_create_qp { #define CMDQ_CREATE_QP_QP_FLAGS_OPTIMIZED_TRANSMIT_ENABLED 0x20UL #define CMDQ_CREATE_QP_QP_FLAGS_RESPONDER_UD_CQE_WITH_CFA 0x40UL #define CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED 0x80UL + #define CMDQ_CREATE_QP_QP_FLAGS_EXPRESS_MODE_ENABLED 0x100UL + #define CMDQ_CREATE_QP_QP_FLAGS_STEERING_TAG_VALID 0x200UL + #define CMDQ_CREATE_QP_QP_FLAGS_RDMA_READ_OR_ATOMICS_USED 0x400UL #define CMDQ_CREATE_QP_QP_FLAGS_LAST \ - CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED + CMDQ_CREATE_QP_QP_FLAGS_RDMA_READ_OR_ATOMICS_USED u8 type; #define CMDQ_CREATE_QP_TYPE_RC 0x2UL #define CMDQ_CREATE_QP_TYPE_UD 0x4UL @@ -492,6 +495,9 @@ struct cmdq_create_qp { __le64 rq_pbl; __le64 irrq_addr; __le64 orrq_addr; + __le32 request_xid; + __le16 steering_tag; + __le16 reserved16; }; /* creq_create_qp_resp (size:128b/16B) */ @@ -972,13 +978,14 @@ struct creq_query_qp_extend_resp_sb_tlv { __le16 reserved_16; }; -/* cmdq_create_srq (size:384b/48B) */ +/* cmdq_create_srq (size:448b/56B) */ struct cmdq_create_srq { u8 opcode; #define CMDQ_CREATE_SRQ_OPCODE_CREATE_SRQ 0x5UL #define CMDQ_CREATE_SRQ_OPCODE_LAST CMDQ_CREATE_SRQ_OPCODE_CREATE_SRQ u8 cmd_size; __le16 flags; + #define 
CMDQ_CREATE_SRQ_FLAGS_STEERING_TAG_VALID 0x1UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1012,6 +1019,8 @@ struct cmdq_create_srq { __le32 dpi; __le32 pd_id; __le64 pbl; + __le16 steering_tag; + u8 reserved48[6]; }; /* creq_create_srq_resp (size:128b/16B) */ @@ -1118,7 +1127,7 @@ struct creq_query_srq_resp_sb { __le32 data[4]; }; -/* cmdq_create_cq (size:384b/48B) */ +/* cmdq_create_cq (size:448b/56B) */ struct cmdq_create_cq { u8 opcode; #define CMDQ_CREATE_CQ_OPCODE_CREATE_CQ 0x9UL @@ -1126,6 +1135,8 @@ struct cmdq_create_cq { u8 cmd_size; __le16 flags; #define CMDQ_CREATE_CQ_FLAGS_DISABLE_CQ_OVERFLOW_DETECTION 0x1UL + #define CMDQ_CREATE_CQ_FLAGS_STEERING_TAG_VALID 0x2UL + #define CMDQ_CREATE_CQ_FLAGS_INFINITE_CQ_MODE 0x4UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1157,6 +1168,8 @@ struct cmdq_create_cq { __le32 dpi; __le32 cq_size; __le64 pbl; + __le16 steering_tag; + u8 reserved48[6]; }; /* creq_create_cq_resp (size:128b/16B) */ @@ -1288,11 +1301,12 @@ struct cmdq_allocate_mrw { #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A 0x3UL #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B 0x4UL #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_LAST CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B - #define CMDQ_ALLOCATE_MRW_UNUSED4_MASK 0xf0UL - #define CMDQ_ALLOCATE_MRW_UNUSED4_SFT 4 + #define CMDQ_ALLOCATE_MRW_STEERING_TAG_VALID 0x10UL + #define CMDQ_ALLOCATE_MRW_UNUSED4_MASK 0xe0UL + #define CMDQ_ALLOCATE_MRW_UNUSED4_SFT 5 u8 access; #define CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY 0x20UL - __le16 unused16; + __le16 steering_tag; __le32 pd_id; }; @@ -1359,14 +1373,16 @@ struct creq_deallocate_key_resp { __le32 bound_window_info; }; -/* cmdq_register_mr (size:384b/48B) */ +/* cmdq_register_mr (size:448b/56B) */ struct cmdq_register_mr { u8 opcode; #define CMDQ_REGISTER_MR_OPCODE_REGISTER_MR 0xfUL #define CMDQ_REGISTER_MR_OPCODE_LAST CMDQ_REGISTER_MR_OPCODE_REGISTER_MR u8 cmd_size; __le16 flags; - #define CMDQ_REGISTER_MR_FLAGS_ALLOC_MR 0x1UL + #define CMDQ_REGISTER_MR_FLAGS_ALLOC_MR 
0x1UL + #define CMDQ_REGISTER_MR_FLAGS_STEERING_TAG_VALID 0x2UL + #define CMDQ_REGISTER_MR_FLAGS_ENABLE_RO 0x4UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1415,6 +1431,8 @@ struct cmdq_register_mr { __le64 pbl; __le64 va; __le64 mr_size; + __le16 steering_tag; + u8 reserved48[6]; }; /* creq_register_mr_resp (size:128b/16B) */ From b98d96971908b71be394c43d87d037b5fb4e6d8a Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Sun, 1 Sep 2024 22:52:29 -0700 Subject: [PATCH 50/99] RDMA/bnxt_re: Rename a variable Renaming flags to access_flags for clarity. Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1725256351-12751-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 8 ++++---- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 4 ++-- drivers/infiniband/hw/bnxt_re/qplib_sp.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index f9f944a6094a..f26b7f82dfbd 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -517,7 +517,7 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); if (rc) { ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n"); @@ -3881,7 +3881,7 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; /* Allocate and register 0 as the address */ @@ 
-3981,7 +3981,7 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type, mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = BNXT_QPLIB_FR_PMR; + mr->qplib_mr.access_flags = BNXT_QPLIB_FR_PMR; mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); @@ -4098,7 +4098,7 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64 mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR; rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index ca2aa35e6eec..c26e8f5b6729 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -544,7 +544,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw) req.pd_id = cpu_to_le32(mrw->pd->id); req.mrw_flags = mrw->type; if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR && - mrw->flags & BNXT_QPLIB_FR_PMR) || + mrw->access_flags & BNXT_QPLIB_FR_PMR) || mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A || mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B) req.access = CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY; @@ -656,7 +656,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, req.log2_pbl_pg_size = cpu_to_le16(((ilog2(PAGE_SIZE) << CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT) & CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK)); - req.access = (mr->flags & 0xFFFF); + req.access = (mr->access_flags & 0xFFFF); req.va = cpu_to_le64(mr->va); req.key = cpu_to_le32(mr->lkey); req.mr_size = cpu_to_le64(mr->total_size); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index a633e2a9aa94..06e74b6da434 
100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -109,7 +109,7 @@ struct bnxt_qplib_ah { struct bnxt_qplib_mrw { struct bnxt_qplib_pd *pd; int type; - u32 flags; + u32 access_flags; #define BNXT_QPLIB_FR_PMR 0x80000000 u32 lkey; u32 rkey; From f786eebbbefa0c080d45533c5e0f66d500268961 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Sun, 1 Sep 2024 22:52:30 -0700 Subject: [PATCH 51/99] RDMA/bnxt_re: Avoid an extra hwrm per MR creation Firmware now have a new mr registration command where both MR allocation and registration can be done in a single hwrm command. Driver has to issue this new hwrm command whenever the support flag is set. This reduces the number of hwrm issued per MR creation and speed up the MR creation. Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1725256351-12751-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 36 ++++++++++++++--------- drivers/infiniband/hw/bnxt_re/qplib_res.h | 5 ++++ drivers/infiniband/hw/bnxt_re/qplib_sp.c | 8 +++++ drivers/infiniband/hw/bnxt_re/qplib_sp.h | 1 + 4 files changed, 36 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index f26b7f82dfbd..13e3d71d256a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -518,14 +518,18 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) mr->qplib_mr.pd = &pd->qplib_pd; mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); - rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) { - ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n"); - goto fail; - } + if (!_is_alloc_mr_unified(rdev->dev_attr.dev_cap_flags)) { + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + 
ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n"); + goto fail; + } - /* Register MR */ - mr->ib_mr.lkey = mr->qplib_mr.lkey; + /* Register MR */ + mr->ib_mr.lkey = mr->qplib_mr.lkey; + } else { + mr->qplib_mr.flags = CMDQ_REGISTER_MR_FLAGS_ALLOC_MR; + } mr->qplib_mr.va = (u64)(unsigned long)fence->va; mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES; rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, @@ -4101,14 +4105,18 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64 mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR; - rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) { - ibdev_err(&rdev->ibdev, "Failed to allocate MR rc = %d", rc); - rc = -EIO; - goto free_mr; + if (!_is_alloc_mr_unified(rdev->dev_attr.dev_cap_flags)) { + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to allocate MR rc = %d", rc); + rc = -EIO; + goto free_mr; + } + /* The fixed portion of the rkey is the same as the lkey */ + mr->ib_mr.rkey = mr->qplib_mr.rkey; + } else { + mr->qplib_mr.flags = CMDQ_REGISTER_MR_FLAGS_ALLOC_MR; } - /* The fixed portion of the rkey is the same as the lkey */ - mr->ib_mr.rkey = mr->qplib_mr.rkey; mr->ib_umem = umem; mr->qplib_mr.va = virt_addr; mr->qplib_mr.total_size = length; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index a0f78cde314f..b452b2f46ceb 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -565,4 +565,9 @@ static inline u8 bnxt_qplib_dbr_pacing_en(struct bnxt_qplib_chip_ctx *cctx) return cctx->modes.dbr_pacing; } +static inline bool _is_alloc_mr_unified(u16 dev_cap_flags) +{ + return dev_cap_flags & CREQ_QUERY_FUNC_RESP_SB_MR_REGISTER_ALLOC; +} + #endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c 
b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index c26e8f5b6729..4f75e7e5bcf7 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -659,6 +659,9 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, req.access = (mr->access_flags & 0xFFFF); req.va = cpu_to_le64(mr->va); req.key = cpu_to_le32(mr->lkey); + if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) + req.key = cpu_to_le32(mr->pd->id); + req.flags = cpu_to_le16(mr->flags); req.mr_size = cpu_to_le64(mr->total_size); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), @@ -667,6 +670,11 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, if (rc) goto fail; + if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) { + mr->lkey = le32_to_cpu(resp.xid); + mr->rkey = mr->lkey; + } + return 0; fail: diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 06e74b6da434..4ce44aabfdc1 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -117,6 +117,7 @@ struct bnxt_qplib_mrw { u64 va; u64 total_size; u32 npages; + u16 flags; u64 mr_handle; struct bnxt_qplib_hwq hwq; }; From dc116b7fddbdad000b6f2a8ca41d1fe5371b403c Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Sun, 1 Sep 2024 22:52:31 -0700 Subject: [PATCH 52/99] RDMA/bnxt_re: Add support for MR Relaxed Ordering Some of the adapters support Relaxed Ordering for the MRs. Driver queries support for Memory region relax ordering support from firmware and set relax ordering bit in REGISTER_MR request, if the users request for the support. Also, this is supported only if the PCIe device has enabled relaxed ordering attribute. 
Reviewed-by: Chandramohan Akula Reviewed-by: Selvin Xavier Reviewed-by: Vijay Kumar Mandadapu Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1725256351-12751-5-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 14 ++++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_res.h | 5 +++++ 2 files changed, 19 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 13e3d71d256a..82c1f3b2f825 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -115,6 +115,14 @@ static enum ib_access_flags __to_ib_access_flags(int qflags) return iflags; }; +static void bnxt_re_check_and_set_relaxed_ordering(struct bnxt_re_dev *rdev, + struct bnxt_qplib_mrw *qplib_mr) +{ + if (_is_relaxed_ordering_supported(rdev->dev_attr.dev_cap_flags2) && + pcie_relaxed_ordering_enabled(rdev->en_dev->pdev)) + qplib_mr->flags |= CMDQ_REGISTER_MR_FLAGS_ENABLE_RO; +} + static int bnxt_re_build_sgl(struct ib_sge *ib_sg_list, struct bnxt_qplib_sge *sg_list, int num) { @@ -3888,6 +3896,9 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; + if (mr_access_flags & IB_ACCESS_RELAXED_ORDERING) + bnxt_re_check_and_set_relaxed_ordering(rdev, &mr->qplib_mr); + /* Allocate and register 0 as the address */ rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); if (rc) @@ -4121,6 +4132,9 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64 mr->qplib_mr.va = virt_addr; mr->qplib_mr.total_size = length; + if (mr_access_flags & IB_ACCESS_RELAXED_ORDERING) + bnxt_re_check_and_set_relaxed_ordering(rdev, &mr->qplib_mr); + umem_pgs = ib_umem_num_dma_blocks(umem, page_size); rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, umem, 
umem_pgs, page_size); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index b452b2f46ceb..049805ac95cf 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -570,4 +570,9 @@ static inline bool _is_alloc_mr_unified(u16 dev_cap_flags) return dev_cap_flags & CREQ_QUERY_FUNC_RESP_SB_MR_REGISTER_ALLOC; } +static inline bool _is_relaxed_ordering_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_MEMORY_REGION_RO_SUPPORTED; +} + #endif /* __BNXT_QPLIB_RES_H__ */ From 112e6e83a894260cc7efe79a1fc47d4d51461742 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Mon, 2 Sep 2024 13:35:40 +0300 Subject: [PATCH 53/99] IB/mlx5: Fix UMR pd cleanup on error flow of driver init The cited commit moves the pd allocation from function mlx5r_umr_resource_cleanup() to a new function mlx5r_umr_cleanup(). So the fix in commit [1] is broken. In error flow, will hit panic [2]. Fix it by checking pd pointer to avoid panic if it is NULL; [1] RDMA/mlx5: Fix UMR cleanup on error flow of driver init [2] [ 347.567063] infiniband mlx5_0: Couldn't register device with driver model [ 347.591382] BUG: kernel NULL pointer dereference, address: 0000000000000020 [ 347.593438] #PF: supervisor read access in kernel mode [ 347.595176] #PF: error_code(0x0000) - not-present page [ 347.596962] PGD 0 P4D 0 [ 347.601361] RIP: 0010:ib_dealloc_pd_user+0x12/0xc0 [ib_core] [ 347.604171] RSP: 0018:ffff888106293b10 EFLAGS: 00010282 [ 347.604834] RAX: 0000000000000000 RBX: 000000000000000e RCX: 0000000000000000 [ 347.605672] RDX: ffff888106293ad0 RSI: 0000000000000000 RDI: 0000000000000000 [ 347.606529] RBP: 0000000000000000 R08: ffff888106293ae0 R09: ffff888106293ae0 [ 347.607379] R10: 0000000000000a06 R11: 0000000000000000 R12: 0000000000000000 [ 347.608224] R13: ffffffffa0704dc0 R14: 0000000000000001 R15: 0000000000000001 [ 347.609067] FS: 00007fdc720cd9c0(0000) 
GS:ffff88852c880000(0000) knlGS:0000000000000000 [ 347.610094] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 347.610727] CR2: 0000000000000020 CR3: 0000000103012003 CR4: 0000000000370eb0 [ 347.611421] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 347.612113] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 347.612804] Call Trace: [ 347.613130] [ 347.613417] ? __die+0x20/0x60 [ 347.613793] ? page_fault_oops+0x150/0x3e0 [ 347.614243] ? free_msg+0x68/0x80 [mlx5_core] [ 347.614840] ? cmd_exec+0x48f/0x11d0 [mlx5_core] [ 347.615359] ? exc_page_fault+0x74/0x130 [ 347.615808] ? asm_exc_page_fault+0x22/0x30 [ 347.616273] ? ib_dealloc_pd_user+0x12/0xc0 [ib_core] [ 347.616801] mlx5r_umr_cleanup+0x23/0x90 [mlx5_ib] [ 347.617365] mlx5_ib_stage_pre_ib_reg_umr_cleanup+0x36/0x40 [mlx5_ib] [ 347.618025] __mlx5_ib_add+0x96/0xd0 [mlx5_ib] [ 347.618539] mlx5r_probe+0xe9/0x310 [mlx5_ib] [ 347.619032] ? kernfs_add_one+0x107/0x150 [ 347.619478] ? __mlx5_ib_add+0xd0/0xd0 [mlx5_ib] [ 347.619984] auxiliary_bus_probe+0x3e/0x90 [ 347.620448] really_probe+0xc5/0x3a0 [ 347.620857] __driver_probe_device+0x80/0x160 [ 347.621325] driver_probe_device+0x1e/0x90 [ 347.621770] __driver_attach+0xec/0x1c0 [ 347.622213] ? __device_attach_driver+0x100/0x100 [ 347.622724] bus_for_each_dev+0x71/0xc0 [ 347.623151] bus_add_driver+0xed/0x240 [ 347.623570] driver_register+0x58/0x100 [ 347.623998] __auxiliary_driver_register+0x6a/0xc0 [ 347.624499] ? driver_register+0xae/0x100 [ 347.624940] ? 0xffffffffa0893000 [ 347.625329] mlx5_ib_init+0x16a/0x1e0 [mlx5_ib] [ 347.625845] do_one_initcall+0x4a/0x2a0 [ 347.626273] ? 
gcov_event+0x2e2/0x3a0 [ 347.626706] do_init_module+0x8a/0x260 [ 347.627126] init_module_from_file+0x8b/0xd0 [ 347.627596] __x64_sys_finit_module+0x1ca/0x2f0 [ 347.628089] do_syscall_64+0x4c/0x100 Fixes: 638420115cc4 ("IB/mlx5: Create UMR QP just before first reg_mr occurs") Signed-off-by: Chris Mi Reviewed-by: Jianbo Liu Link: https://patch.msgid.link/778c40c60287992da5d6ec92bb07b67f7bb5e6ef.1725273295.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/umr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index eb74c163fd83..887fd6fa3ba9 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -224,6 +224,9 @@ int mlx5r_umr_init(struct mlx5_ib_dev *dev) void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev) { + if (!dev->umrc.pd) + return; + mutex_destroy(&dev->umrc.init_lock); ib_dealloc_pd(dev->umrc.pd); } From 1403c8b14765eab805377dd3b75e96ace8747aed Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 2 Sep 2024 13:36:33 +0300 Subject: [PATCH 54/99] IB/core: Fix ib_cache_setup_one error flow cleanup When ib_cache_update return an error, we exit ib_cache_setup_one instantly with no proper cleanup, even though before this we had already successfully done gid_table_setup_one, that results in the kernel WARN below. Do proper cleanup using gid_table_cleanup_one before returning the err in order to fix the issue. 
WARNING: CPU: 4 PID: 922 at drivers/infiniband/core/cache.c:806 gid_table_release_one+0x181/0x1a0 Modules linked in: CPU: 4 UID: 0 PID: 922 Comm: c_repro Not tainted 6.11.0-rc1+ #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:gid_table_release_one+0x181/0x1a0 Code: 44 8b 38 75 0c e8 2f cb 34 ff 4d 8b b5 28 05 00 00 e8 23 cb 34 ff 44 89 f9 89 da 4c 89 f6 48 c7 c7 d0 58 14 83 e8 4f de 21 ff <0f> 0b 4c 8b 75 30 e9 54 ff ff ff 48 8 3 c4 10 5b 5d 41 5c 41 5d 41 RSP: 0018:ffffc90002b835b0 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff811c8527 RDX: 0000000000000000 RSI: ffffffff811c8534 RDI: 0000000000000001 RBP: ffff8881011b3d00 R08: ffff88810b3abe00 R09: 205d303839303631 R10: 666572207972746e R11: 72746e6520444947 R12: 0000000000000001 R13: ffff888106390000 R14: ffff8881011f2110 R15: 0000000000000001 FS: 00007fecc3b70800(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000340 CR3: 000000010435a001 CR4: 00000000003706b0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? show_regs+0x94/0xa0 ? __warn+0x9e/0x1c0 ? gid_table_release_one+0x181/0x1a0 ? report_bug+0x1f9/0x340 ? gid_table_release_one+0x181/0x1a0 ? handle_bug+0xa2/0x110 ? exc_invalid_op+0x31/0xa0 ? asm_exc_invalid_op+0x16/0x20 ? __warn_printk+0xc7/0x180 ? __warn_printk+0xd4/0x180 ? gid_table_release_one+0x181/0x1a0 ib_device_release+0x71/0xe0 ? __pfx_ib_device_release+0x10/0x10 device_release+0x44/0xd0 kobject_put+0x135/0x3d0 put_device+0x20/0x30 rxe_net_add+0x7d/0xa0 rxe_newlink+0xd7/0x190 nldev_newlink+0x1b0/0x2a0 ? 
__pfx_nldev_newlink+0x10/0x10 rdma_nl_rcv_msg+0x1ad/0x2e0 rdma_nl_rcv_skb.constprop.0+0x176/0x210 netlink_unicast+0x2de/0x400 netlink_sendmsg+0x306/0x660 __sock_sendmsg+0x110/0x120 ____sys_sendmsg+0x30e/0x390 ___sys_sendmsg+0x9b/0xf0 ? kstrtouint+0x6e/0xa0 ? kstrtouint_from_user+0x7c/0xb0 ? get_pid_task+0xb0/0xd0 ? proc_fail_nth_write+0x5b/0x140 ? __fget_light+0x9a/0x200 ? preempt_count_add+0x47/0xa0 __sys_sendmsg+0x61/0xd0 do_syscall_64+0x50/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 1901b91f9982 ("IB/core: Fix potential NULL pointer dereference in pkey cache") Signed-off-by: Patrisious Haddad Reviewed-by: Maher Sanalla Link: https://patch.msgid.link/79137687d829899b0b1c9835fcb4b258004c439a.1725273354.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 6791df64a5fe..b7c078b7f7cf 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1640,8 +1640,10 @@ int ib_cache_setup_one(struct ib_device *device) rdma_for_each_port (device, p) { err = ib_cache_update(device, p, true, true, true); - if (err) + if (err) { + gid_table_cleanup_one(device); return err; + } } return 0; From 34efda1735a179cd233479a99f09728825748ea1 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 2 Sep 2024 13:37:03 +0300 Subject: [PATCH 55/99] RDMA/mlx5: Enable ATS when allocating kernel MRs When creating kernel MRs, it is not definitive whether they will be used for peer-to-peer transactions or for other usecases, since address mapping is performed only after the MR is created. Since peer-to-peer transactions benefit significantly from ATS performance-wise, enable ATS on newly-allocated kernel MRs when supported. 
Signed-off-by: Maher Sanalla Reviewed-by: Gal Shalom Link: https://patch.msgid.link/fafd4c9f14cf438d2882d88649c2947e1d05d0b4.1725273403.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 6829e3688b60..250c246ae792 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1065,6 +1065,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) MLX5_SET(mkc, mkc, length64, 1); set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0, pd); + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) @@ -2156,6 +2157,7 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, int access_mode, int page_shift) { + struct mlx5_ib_dev *dev = to_mdev(pd->device); void *mkc; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); @@ -2168,6 +2170,9 @@ static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET(mkc, mkc, log_page_size, page_shift); + if (access_mode == MLX5_MKC_ACCESS_MODE_PA || + access_mode == MLX5_MKC_ACCESS_MODE_MTT) + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); } static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, From b24506f1c3c4e3379babf7c59e4873c862e674cb Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 2 Sep 2024 19:29:18 +0800 Subject: [PATCH 56/99] RDMA/erdma: Refactor the initialization and destruction of EQ We extracted the common parts of the initialization/destruction process to make the code cleaner. 
Signed-off-by: Cheng Xu Link: https://patch.msgid.link/20240902112920.58749-2-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma.h | 3 +- drivers/infiniband/hw/erdma/erdma_cmdq.c | 26 ++----- drivers/infiniband/hw/erdma/erdma_eq.c | 89 ++++++++++++------------ drivers/infiniband/hw/erdma/erdma_main.c | 4 +- 4 files changed, 52 insertions(+), 70 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index c8bd698e21b0..3c166359448d 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -274,7 +274,8 @@ void notify_eq(struct erdma_eq *eq); void *get_next_valid_eqe(struct erdma_eq *eq); int erdma_aeq_init(struct erdma_dev *dev); -void erdma_aeq_destroy(struct erdma_dev *dev); +int erdma_eq_common_init(struct erdma_dev *dev, struct erdma_eq *eq, u32 depth); +void erdma_eq_destroy(struct erdma_dev *dev, struct erdma_eq *eq); void erdma_aeq_event_handler(struct erdma_dev *dev); void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb); diff --git a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c index 43ff40b5a09d..a3d8922d1ad1 100644 --- a/drivers/infiniband/hw/erdma/erdma_cmdq.c +++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c @@ -158,20 +158,13 @@ static int erdma_cmdq_eq_init(struct erdma_dev *dev) { struct erdma_cmdq *cmdq = &dev->cmdq; struct erdma_eq *eq = &cmdq->eq; + int ret; - eq->depth = cmdq->max_outstandings; - eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, - &eq->qbuf_dma_addr, GFP_KERNEL); - if (!eq->qbuf) - return -ENOMEM; - - spin_lock_init(&eq->lock); - atomic64_set(&eq->event_num, 0); + ret = erdma_eq_common_init(dev, eq, cmdq->max_outstandings); + if (ret) + return ret; eq->db = dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG; - eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); - if (!eq->dbrec) - goto err_out; erdma_reg_write32(dev, 
ERDMA_REGS_CMDQ_EQ_ADDR_H_REG, upper_32_bits(eq->qbuf_dma_addr)); @@ -181,12 +174,6 @@ static int erdma_cmdq_eq_init(struct erdma_dev *dev) erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, eq->dbrec_dma); return 0; - -err_out: - dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, - eq->qbuf_dma_addr); - - return -ENOMEM; } int erdma_cmdq_init(struct erdma_dev *dev) @@ -247,10 +234,7 @@ void erdma_cmdq_destroy(struct erdma_dev *dev) clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); - dma_free_coherent(&dev->pdev->dev, cmdq->eq.depth << EQE_SHIFT, - cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); - - dma_pool_free(dev->db_pool, cmdq->eq.dbrec, cmdq->eq.dbrec_dma); + erdma_eq_destroy(dev, &cmdq->eq); dma_free_coherent(&dev->pdev->dev, cmdq->sq.depth << SQEBB_SHIFT, cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c index 84ccdd8144c9..9a72fec6d5cc 100644 --- a/drivers/infiniband/hw/erdma/erdma_eq.c +++ b/drivers/infiniband/hw/erdma/erdma_eq.c @@ -80,25 +80,51 @@ void erdma_aeq_event_handler(struct erdma_dev *dev) notify_eq(&dev->aeq); } -int erdma_aeq_init(struct erdma_dev *dev) +int erdma_eq_common_init(struct erdma_dev *dev, struct erdma_eq *eq, u32 depth) { - struct erdma_eq *eq = &dev->aeq; + u32 buf_size = depth << EQE_SHIFT; - eq->depth = ERDMA_DEFAULT_EQ_DEPTH; - - eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, + eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, buf_size, &eq->qbuf_dma_addr, GFP_KERNEL); if (!eq->qbuf) return -ENOMEM; + eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); + if (!eq->dbrec) + goto err_free_qbuf; + spin_lock_init(&eq->lock); atomic64_set(&eq->event_num, 0); atomic64_set(&eq->notify_num, 0); + eq->ci = 0; + eq->depth = depth; + + return 0; + +err_free_qbuf: + dma_free_coherent(&dev->pdev->dev, buf_size, eq->qbuf, + eq->qbuf_dma_addr); + + return -ENOMEM; +} + +void erdma_eq_destroy(struct 
erdma_dev *dev, struct erdma_eq *eq) +{ + dma_pool_free(dev->db_pool, eq->dbrec, eq->dbrec_dma); + dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, + eq->qbuf_dma_addr); +} + +int erdma_aeq_init(struct erdma_dev *dev) +{ + struct erdma_eq *eq = &dev->aeq; + int ret; + + ret = erdma_eq_common_init(dev, &dev->aeq, ERDMA_DEFAULT_EQ_DEPTH); + if (ret) + return ret; eq->db = dev->func_bar + ERDMA_REGS_AEQ_DB_REG; - eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); - if (!eq->dbrec) - goto err_out; erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG, upper_32_bits(eq->qbuf_dma_addr)); @@ -108,22 +134,6 @@ int erdma_aeq_init(struct erdma_dev *dev) erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, eq->dbrec_dma); return 0; - -err_out: - dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, - eq->qbuf_dma_addr); - - return -ENOMEM; -} - -void erdma_aeq_destroy(struct erdma_dev *dev) -{ - struct erdma_eq *eq = &dev->aeq; - - dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, - eq->qbuf_dma_addr); - - dma_pool_free(dev->db_pool, eq->dbrec, eq->dbrec_dma); } void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb) @@ -234,32 +244,21 @@ static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) struct erdma_eq *eq = &dev->ceqs[ceqn].eq; int ret; - eq->depth = ERDMA_DEFAULT_EQ_DEPTH; - eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, - &eq->qbuf_dma_addr, GFP_KERNEL); - if (!eq->qbuf) - return -ENOMEM; - - spin_lock_init(&eq->lock); - atomic64_set(&eq->event_num, 0); - atomic64_set(&eq->notify_num, 0); + ret = erdma_eq_common_init(dev, eq, ERDMA_DEFAULT_EQ_DEPTH); + if (ret) + return ret; eq->db = dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG + (ceqn + 1) * ERDMA_DB_SIZE; - - eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); - if (!eq->dbrec) { - dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, - eq->qbuf, eq->qbuf_dma_addr); - return -ENOMEM; - } - 
- eq->ci = 0; dev->ceqs[ceqn].dev = dev; + dev->ceqs[ceqn].ready = true; /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */ ret = create_eq_cmd(dev, ceqn + 1, eq); - dev->ceqs[ceqn].ready = ret ? false : true; + if (ret) { + erdma_eq_destroy(dev, eq); + dev->ceqs[ceqn].ready = false; + } return ret; } @@ -283,9 +282,7 @@ static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) if (err) return; - dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, - eq->qbuf_dma_addr); - dma_pool_free(dev->db_pool, eq->dbrec, eq->dbrec_dma); + erdma_eq_destroy(dev, eq); } int erdma_ceqs_init(struct erdma_dev *dev) diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 7080f8a71ec4..9defbd55893a 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -333,7 +333,7 @@ err_uninit_cmdq: erdma_cmdq_destroy(dev); err_uninit_aeq: - erdma_aeq_destroy(dev); + erdma_eq_destroy(dev, &dev->aeq); err_uninit_comm_irq: erdma_comm_irq_uninit(dev); @@ -366,7 +366,7 @@ static void erdma_remove_dev(struct pci_dev *pdev) erdma_ceqs_uninit(dev); erdma_hw_reset(dev); erdma_cmdq_destroy(dev); - erdma_aeq_destroy(dev); + erdma_eq_destroy(dev, &dev->aeq); erdma_comm_irq_uninit(dev); pci_free_irq_vectors(dev->pdev); erdma_device_uninit(dev); From b80330f1051d4e89d234a191db99caad5fbd8cbc Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 2 Sep 2024 19:29:19 +0800 Subject: [PATCH 57/99] RDMA/erdma: Add disassociate ucontext support All IO pages mapped to user space are handled by rdma_user_mmap_io, so add empty stub for disassociate ucontext. 
Signed-off-by: Cheng Xu Link: https://patch.msgid.link/20240902112920.58749-3-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_main.c | 1 + drivers/infiniband/hw/erdma/erdma_verbs.c | 4 ++++ drivers/infiniband/hw/erdma/erdma_verbs.h | 1 + 3 files changed, 6 insertions(+) diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 9defbd55893a..62f497a71004 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -490,6 +490,7 @@ static const struct ib_device_ops erdma_device_ops = { .dereg_mr = erdma_dereg_mr, .destroy_cq = erdma_destroy_cq, .destroy_qp = erdma_destroy_qp, + .disassociate_ucontext = erdma_disassociate_ucontext, .get_dma_mr = erdma_get_dma_mr, .get_hw_stats = erdma_get_hw_stats, .get_port_immutable = erdma_get_port_immutable, diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index d7e1cbf9f5c2..1d2ae83af0d3 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -1701,6 +1701,10 @@ err_out_xa: return ret; } +void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ +} + void erdma_set_mtu(struct erdma_dev *dev, u32 mtu) { struct erdma_cmdq_config_mtu_req req; diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index 6afdc02f5869..c998acd39a78 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -344,6 +344,7 @@ int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_udata *data); int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext); int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); struct ib_mr 
*erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, u64 virt, int access, struct ib_udata *udata); From e77127ff6416b17e0b3e630ac46ee5c9a6570f57 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 2 Sep 2024 19:29:20 +0800 Subject: [PATCH 58/99] RDMA/erdma: Return QP state in erdma_query_qp Fix qp_state and cur_qp_state to return correct values in struct ib_qp_attr. Fixes: 155055771704 ("RDMA/erdma: Add verbs implementation") Signed-off-by: Cheng Xu Link: https://patch.msgid.link/20240902112920.58749-4-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_verbs.c | 25 ++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 1d2ae83af0d3..51d619edb6c5 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -1544,11 +1544,31 @@ int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, return ret; } +static enum ib_qp_state query_qp_state(struct erdma_qp *qp) +{ + switch (qp->attrs.state) { + case ERDMA_QP_STATE_IDLE: + return IB_QPS_INIT; + case ERDMA_QP_STATE_RTR: + return IB_QPS_RTR; + case ERDMA_QP_STATE_RTS: + return IB_QPS_RTS; + case ERDMA_QP_STATE_CLOSING: + return IB_QPS_ERR; + case ERDMA_QP_STATE_TERMINATE: + return IB_QPS_ERR; + case ERDMA_QP_STATE_ERROR: + return IB_QPS_ERR; + default: + return IB_QPS_ERR; + } +} + int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { - struct erdma_qp *qp; struct erdma_dev *dev; + struct erdma_qp *qp; if (ibqp && qp_attr && qp_init_attr) { qp = to_eqp(ibqp); @@ -1575,6 +1595,9 @@ int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_init_attr->cap = qp_attr->cap; + qp_attr->qp_state = query_qp_state(qp); + qp_attr->cur_qp_state = query_qp_state(qp); + return 0; } From 
30e6bd8d3b5639f8f4261e5e6c0917ce264b8dc2 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Tue, 3 Sep 2024 14:24:47 +0300 Subject: [PATCH 59/99] RDMA/mlx5: Drop redundant work canceling from clean_keys() The canceling of delayed work in clean_keys() is a leftover from years back and was added to prevent races in the cleanup process of MR cache. The cleanup process was rewritten a few years ago and the canceling of delayed work and flushing of workqueue was added before the call to clean_keys(). Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/943d21f5a9dba7b98a3e1d531e3561ffe9745d71.1725362530.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 250c246ae792..511d50491352 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -772,7 +772,6 @@ static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) { u32 mkey; - cancel_delayed_work(&ent->dwork); spin_lock_irq(&ent->mkeys_queue.lock); while (ent->mkeys_queue.ci) { mkey = pop_mkey_locked(ent); From 6f5cd6ac9a4201e4ba6f10b76a9da8044d6e38b0 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Tue, 3 Sep 2024 14:24:48 +0300 Subject: [PATCH 60/99] RDMA/mlx5: Fix counter update on MR cache mkey creation After an mkey is created, update the counter for pending mkeys before rescheduling the work that is filling the cache. Rescheduling the work with a full MR cache entry and a wrong 'pending' counter will cause us to miss disabling the fill_to_high_water flag. Thus leaving the cache full but with an indication that it still needs to be filled up to its full size (2 * limit). Next time an mkey will be taken from the cache, we'll unnecessarily continue the process of filling the cache to its full size.
Fixes: 57e7071683ef ("RDMA/mlx5: Implement mkeys management via LIFO queue") Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/0f44f462ba22e45f72cb3d0ec6a748634086b8d0.1725362530.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 511d50491352..0f90086327fc 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -214,9 +214,9 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context) spin_lock_irqsave(&ent->mkeys_queue.lock, flags); push_mkey_locked(ent, mkey_out->mkey); + ent->pending--; /* If we are doing fill_to_high_water then keep going. */ queue_adjust_cache_locked(ent); - ent->pending--; spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags); kfree(mkey_out); } From ee6d57a2e13d11ce9050cfc3e3b69ef707a44a63 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Tue, 3 Sep 2024 14:24:49 +0300 Subject: [PATCH 61/99] RDMA/mlx5: Limit usage of over-sized mkeys from the MR cache When searching the MR cache for suitable cache entries, don't use mkeys larger than twice the size required for the MR. This should ensure the usage of mkeys closer to the minimal required size and reduce memory waste. On driver init we create entries for mkeys with clear attributes and powers of 2 sizes from 4 to the max supported size. This solves the issue for anyone using mkeys that fit these requirements. In the use case where an MR is registered with different attributes, like an access flag we can't UMR, we'll create a new cache entry to store it upon dereg. Without this fix, any later registration with the same attributes and smaller size will use the newly created cache entry and its mkeys, disregarding the memory waste of using mkeys larger than required.
For example, one worst-case scenario can be when registering and deregistering a 1GB mkey with ATS enabled which will cause the creation of a new cache entry to hold those type of mkeys. A user registering a 4k MR with ATS will end up using the new cache entry and an mkey that can support a 1GB MR, thus wasting x250k memory than actually needed in the HW. Additionally, allow all small registration to use the smallest size cache entry that is initialized on driver load even if size is larger than twice the required size. Fixes: 73d09b2fe833 ("RDMA/mlx5: Introduce mlx5r_cache_rb_key") Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/8ba3a6e3748aace2026de8b83da03aba084f78f4.1725362530.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 0f90086327fc..81644bb75054 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -49,6 +49,7 @@ enum { MAX_PENDING_REG_MR = 8, }; +#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4 #define MLX5_UMR_ALIGN 2048 static void @@ -662,6 +663,7 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, { struct rb_node *node = dev->cache.rb_root.rb_node; struct mlx5_cache_ent *cur, *smallest = NULL; + u64 ndescs_limit; int cmp; /* @@ -680,10 +682,18 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, return cur; } + /* + * Limit the usage of mkeys larger than twice the required size while + * also allowing the usage of smallest cache entry for small MRs. + */ + ndescs_limit = max_t(u64, rb_key.ndescs * 2, + MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS); + return (smallest && smallest->rb_key.access_mode == rb_key.access_mode && smallest->rb_key.access_flags == rb_key.access_flags && - smallest->rb_key.ats == rb_key.ats) ? + smallest->rb_key.ats == rb_key.ats && + smallest->rb_key.ndescs <= ndescs_limit) ? 
smallest : NULL; } @@ -964,7 +974,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) mlx5_mkey_cache_debugfs_init(dev); mutex_lock(&cache->rb_lock); for (i = 0; i <= mkey_cache_max_order(dev); i++) { - rb_key.ndescs = 1 << (i + 2); + rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i; ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); if (IS_ERR(ent)) { ret = PTR_ERR(ent); From 7ebb00cea49db641b458edef0ede389f7004821d Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Tue, 3 Sep 2024 14:24:50 +0300 Subject: [PATCH 62/99] RDMA/mlx5: Fix MR cache temp entries cleanup Fix the cleanup of the temp cache entries that are dynamically created in the MR cache. The cleanup of the temp cache entries is currently scheduled only when a new entry is created. Since in the cleanup of the entries only the mkeys are destroyed and the cache entry stays in the cache, subsequent registrations might reuse the entry and it will eventually be filled with new mkeys without cleanup ever getting scheduled again. On workloads that register and deregister MRs with a wide range of properties we see the cache ends up holding many cache entries, each holding the max number of mkeys that were ever used through it. Additionally, as the cleanup work is scheduled to run over the whole cache, any mkey that is returned to the cache after the cleanup was scheduled will be held for less than the intended 30 seconds timeout. Solve both issues by dropping the existing remove_ent_work and reusing the existing per-entry work to also handle the temp entries cleanup. Schedule the work to run with a 30 seconds delay every time we push an mkey to a clean temp entry. This ensures the cleanup runs on each entry only 30 seconds after the first mkey was pushed to an empty entry. As we have already been distinguishing between persistent and temp entries when scheduling the cache_work_func, it is not being scheduled in any other flows for the temp entries. 
Another benefit from moving to a per-entry cleanup is we now not required to hold the rb_tree mutex, thus enabling other flow to run concurrently. Fixes: dd1b913fb0d0 ("RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow") Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/e4fa4bb03bebf20dceae320f26816cd2dde23a26.1725362530.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/mr.c | 82 +++++++++++----------------- 2 files changed, 32 insertions(+), 52 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index c0b1a9cd752b..5505eb70939b 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -802,6 +802,7 @@ struct mlx5_cache_ent { u8 is_tmp:1; u8 disabled:1; u8 fill_to_high_water:1; + u8 tmp_cleanup_scheduled:1; /* * - limit is the low water mark for stored mkeys, 2* limit is the @@ -833,7 +834,6 @@ struct mlx5_mkey_cache { struct mutex rb_lock; struct dentry *fs_root; unsigned long last_add; - struct delayed_work remove_ent_dwork; }; struct mlx5_ib_port_resources { diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 81644bb75054..d5b5cd73e20c 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -531,6 +531,21 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) } } +static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) +{ + u32 mkey; + + spin_lock_irq(&ent->mkeys_queue.lock); + while (ent->mkeys_queue.ci) { + mkey = pop_mkey_locked(ent); + spin_unlock_irq(&ent->mkeys_queue.lock); + mlx5_core_destroy_mkey(dev->mdev, mkey); + spin_lock_irq(&ent->mkeys_queue.lock); + } + ent->tmp_cleanup_scheduled = false; + spin_unlock_irq(&ent->mkeys_queue.lock); +} + static void __cache_work_func(struct mlx5_cache_ent *ent) { struct mlx5_ib_dev *dev = ent->dev; @@ -602,7 +617,11 @@ static void 
delayed_cache_work_func(struct work_struct *work) struct mlx5_cache_ent *ent; ent = container_of(work, struct mlx5_cache_ent, dwork.work); - __cache_work_func(ent); + /* temp entries are never filled, only cleaned */ + if (ent->is_tmp) + clean_keys(ent->dev, ent); + else + __cache_work_func(ent); } static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, @@ -778,20 +797,6 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, return _mlx5_mr_cache_alloc(dev, ent, access_flags); } -static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) -{ - u32 mkey; - - spin_lock_irq(&ent->mkeys_queue.lock); - while (ent->mkeys_queue.ci) { - mkey = pop_mkey_locked(ent); - spin_unlock_irq(&ent->mkeys_queue.lock); - mlx5_core_destroy_mkey(dev->mdev, mkey); - spin_lock_irq(&ent->mkeys_queue.lock); - } - spin_unlock_irq(&ent->mkeys_queue.lock); -} - static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) { if (!mlx5_debugfs_root || dev->is_rep) @@ -904,10 +909,6 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, ent->limit = 0; mlx5_mkey_cache_debugfs_add_ent(dev, ent); - } else { - mod_delayed_work(ent->dev->cache.wq, - &ent->dev->cache.remove_ent_dwork, - msecs_to_jiffies(30 * 1000)); } return ent; @@ -918,35 +919,6 @@ mkeys_err: return ERR_PTR(ret); } -static void remove_ent_work_func(struct work_struct *work) -{ - struct mlx5_mkey_cache *cache; - struct mlx5_cache_ent *ent; - struct rb_node *cur; - - cache = container_of(work, struct mlx5_mkey_cache, - remove_ent_dwork.work); - mutex_lock(&cache->rb_lock); - cur = rb_last(&cache->rb_root); - while (cur) { - ent = rb_entry(cur, struct mlx5_cache_ent, node); - cur = rb_prev(cur); - mutex_unlock(&cache->rb_lock); - - spin_lock_irq(&ent->mkeys_queue.lock); - if (!ent->is_tmp) { - spin_unlock_irq(&ent->mkeys_queue.lock); - mutex_lock(&cache->rb_lock); - continue; - } - spin_unlock_irq(&ent->mkeys_queue.lock); - - clean_keys(ent->dev, ent); - mutex_lock(&cache->rb_lock); - } - 
mutex_unlock(&cache->rb_lock); -} - int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mkey_cache *cache = &dev->cache; @@ -962,7 +934,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) mutex_init(&dev->slow_path_mutex); mutex_init(&dev->cache.rb_lock); dev->cache.rb_root = RB_ROOT; - INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func); cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); if (!cache->wq) { mlx5_ib_warn(dev, "failed to create work queue\n"); @@ -1013,7 +984,6 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) return; mutex_lock(&dev->cache.rb_lock); - cancel_delayed_work(&dev->cache.remove_ent_dwork); for (node = rb_first(root); node; node = rb_next(node)) { ent = rb_entry(node, struct mlx5_cache_ent, node); spin_lock_irq(&ent->mkeys_queue.lock); @@ -2054,8 +2024,18 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; - if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) + if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) { + ent = mr->mmkey.cache_ent; + /* upon storing to a clean temp entry - schedule its cleanup */ + spin_lock_irq(&ent->mkeys_queue.lock); + if (ent->is_tmp && !ent->tmp_cleanup_scheduled) { + mod_delayed_work(ent->dev->cache.wq, &ent->dwork, + msecs_to_jiffies(30 * 1000)); + ent->tmp_cleanup_scheduled = true; + } + spin_unlock_irq(&ent->mkeys_queue.lock); return 0; + } if (ent) { spin_lock_irq(&ent->mkeys_queue.lock); From c6b2b5c86d448630cea58bf6fdfc761da3e3efb5 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Wed, 4 Sep 2024 03:04:12 -0700 Subject: [PATCH 63/99] RDMA/bnxt_re: Fix the compatibility flag for variable size WQE For older adapters that doesn't support variable size WQE, driver is wrongly reporting that variable WQE is supported, when the latest library is used. 
Report the variable WQE capability only if the driver supports it. Fixes: 10a104c0debb ("RDMA/bnxt_re: Enable variable size WQEs for user space applications") Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1725444253-13221-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 82c1f3b2f825..ecee691ed1e0 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -4272,7 +4272,8 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) if (ureq.comp_mask & BNXT_RE_COMP_MASK_REQ_UCNTX_VAR_WQE_SUPPORT) { resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; resp.mode = rdev->chip_ctx->modes.wqe_mode; - uctx->cmask |= BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED; + if (resp.mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + uctx->cmask |= BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED; } } From 227f51743b61fe3f6fc481f0fb8086bf8c49b8c9 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Wed, 4 Sep 2024 03:04:13 -0700 Subject: [PATCH 64/99] RDMA/bnxt_re: Fix the max WQE size for static WQE support When variable size WQE is supported, max_qp_sges reported is more than 6. For devices that support variable size WQE, the Send WQE size calculation is wrong when an older library that doesn't support variable size WQE is used. Set the WQE size to 128 when static WQE is supported.
Fixes: de1d364c3815 ("RDMA/bnxt_re: Add support for Variable WQE in Genp7 adapters") Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1725444253-13221-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 21 ++++++++++----------- drivers/infiniband/hw/bnxt_re/qplib_sp.h | 2 ++ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index ecee691ed1e0..460f33914825 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1006,23 +1006,22 @@ static int bnxt_re_setup_swqe_size(struct bnxt_re_qp *qp, align = sizeof(struct sq_send_hdr); ilsize = ALIGN(init_attr->cap.max_inline_data, align); - sq->wqe_size = bnxt_re_get_wqe_size(ilsize, sq->max_sge); - if (sq->wqe_size > bnxt_re_get_swqe_size(dev_attr->max_qp_sges)) - return -EINVAL; - /* For gen p4 and gen p5 backward compatibility mode - * wqe size is fixed to 128 bytes + /* For gen p4 and gen p5 fixed wqe compatibility mode + * wqe size is fixed to 128 bytes - ie 6 SGEs */ - if (sq->wqe_size < bnxt_re_get_swqe_size(dev_attr->max_qp_sges) && - qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) - sq->wqe_size = bnxt_re_get_swqe_size(dev_attr->max_qp_sges); + if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) { + sq->wqe_size = bnxt_re_get_swqe_size(BNXT_STATIC_MAX_SGE); + sq->max_sge = BNXT_STATIC_MAX_SGE; + } else { + sq->wqe_size = bnxt_re_get_wqe_size(ilsize, sq->max_sge); + if (sq->wqe_size > bnxt_re_get_swqe_size(dev_attr->max_qp_sges)) + return -EINVAL; + } if (init_attr->cap.max_inline_data) { qplqp->max_inline_data = sq->wqe_size - sizeof(struct sq_send_hdr); init_attr->cap.max_inline_data = qplqp->max_inline_data; - if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) - sq->max_sge = qplqp->max_inline_data / - sizeof(struct sq_sge); } return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h 
b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 4ce44aabfdc1..acd9c14a31c4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -358,4 +358,6 @@ int bnxt_qplib_modify_cc(struct bnxt_qplib_res *res, #define BNXT_VAR_MAX_SGE 13 #define BNXT_RE_MAX_RQ_WQES 65536 +#define BNXT_STATIC_MAX_SGE 6 + #endif /* __BNXT_QPLIB_SP_H__*/ From 6928d264e328e0cb5ee7663003a6e46e4cba0a7e Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Fri, 6 Sep 2024 17:34:36 +0800 Subject: [PATCH 65/99] RDMA/hns: Don't modify rq next block addr in HIP09 QPC The field 'rq next block addr' in QPC can be updated by driver only on HIP08. On HIP09 HW updates this field while driver is not allowed. Fixes: 926a01dc000d ("RDMA/hns: Add QP operations support for hip08 SoC") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20240906093444.3571619-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 621b057fb9da..a166b476977f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4423,12 +4423,14 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev, upper_32_bits(to_hr_hw_page_addr(mtts[0]))); hr_reg_clear(qpc_mask, QPC_RQ_CUR_BLK_ADDR_H); - context->rq_nxt_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[1])); - qpc_mask->rq_nxt_blk_addr = 0; - - hr_reg_write(context, QPC_RQ_NXT_BLK_ADDR_H, - upper_32_bits(to_hr_hw_page_addr(mtts[1]))); - hr_reg_clear(qpc_mask, QPC_RQ_NXT_BLK_ADDR_H); + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { + context->rq_nxt_blk_addr = + cpu_to_le32(to_hr_hw_page_addr(mtts[1])); + qpc_mask->rq_nxt_blk_addr = 0; + hr_reg_write(context, QPC_RQ_NXT_BLK_ADDR_H, + upper_32_bits(to_hr_hw_page_addr(mtts[1]))); + hr_reg_clear(qpc_mask, 
QPC_RQ_NXT_BLK_ADDR_H); + } return 0; } From fd8489294dd2beefb70f12ec4f6132aeec61a4d0 Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Fri, 6 Sep 2024 17:34:37 +0800 Subject: [PATCH 66/99] RDMA/hns: Fix Use-After-Free of rsv_qp on HIP08 Currently rsv_qp is freed before ib_unregister_device() is called on HIP08. During the time interval, users can still dereg MR and rsv_qp will be used in this process, leading to a UAF. Move the release of rsv_qp after calling ib_unregister_device() to fix it. Fixes: 70f92521584f ("RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT") Signed-off-by: wenglianfa Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20240906093444.3571619-3-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a166b476977f..2225c9cc6366 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2972,6 +2972,9 @@ err_llm_init_failed: static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) { + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + free_mr_exit(hr_dev); + hns_roce_function_clear(hr_dev); if (!hr_dev->is_vf) @@ -6951,9 +6954,6 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT; hns_roce_handle_device_err(hr_dev); - if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) - free_mr_exit(hr_dev); - hns_roce_exit(hr_dev); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); From d586628b169d14bbf36be64d2b3ec9d9d2fe0432 Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Fri, 6 Sep 2024 17:34:39 +0800 Subject: [PATCH 67/99] RDMA/hns: Fix the overflow risk of hem_list_calc_ba_range() The max value of 'unit' and 'hop_num' is 2^24 and 2, so the value of 'step' may exceed the range of 
u32. Change the type of 'step' to u64. Fixes: 38389eaa4db1 ("RDMA/hns: Add mtr support for mixed multihop addressing") Signed-off-by: wenglianfa Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20240906093444.3571619-5-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hem.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 02baa853a76c..42111f31b371 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -1041,9 +1041,9 @@ static bool hem_list_is_bottom_bt(int hopnum, int bt_level) * @bt_level: base address table level * @unit: ba entries per bt page */ -static u32 hem_list_calc_ba_range(int hopnum, int bt_level, int unit) +static u64 hem_list_calc_ba_range(int hopnum, int bt_level, int unit) { - u32 step; + u64 step; int max; int i; @@ -1079,7 +1079,7 @@ int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions, { struct hns_roce_buf_region *r; int total = 0; - int step; + u64 step; int i; for (i = 0; i < region_cnt; i++) { @@ -1110,7 +1110,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, int ret = 0; int max_ofs; int level; - u32 step; + u64 step; int end; if (hopnum <= 1) @@ -1147,7 +1147,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, } start_aligned = (distance / step) * step + r->offset; - end = min_t(int, start_aligned + step - 1, max_ofs); + end = min_t(u64, start_aligned + step - 1, max_ofs); cur = hem_list_alloc_item(hr_dev, start_aligned, end, unit, true); if (!cur) { @@ -1235,7 +1235,7 @@ static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, struct hns_roce_hem_item *hem, *temp_hem; int total = 0; int offset; - int step; + u64 step; step = hem_list_calc_ba_range(r->hopnum, 1, unit); if (step < 1) From 74d315b5af180220d561684d15897730135733a6 Mon Sep 17 
00:00:00 2001 From: Chengchang Tang Date: Fri, 6 Sep 2024 17:34:40 +0800 Subject: [PATCH 68/99] RDMA/hns: Fix spin_unlock_irqrestore() called with IRQs enabled Fix misuse of spin_lock_irq()/spin_unlock_irq() when spin_lock_irqsave()/spin_unlock_irqrestore() was held. This was discovered through the lock debugging, and the corresponding log is as follows: raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 96 PID: 2074 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x30/0x40 ... Call trace: warn_bogus_irq_restore+0x30/0x40 _raw_spin_unlock_irqrestore+0x84/0xc8 add_qp_to_list+0x11c/0x148 [hns_roce_hw_v2] hns_roce_create_qp_common.constprop.0+0x240/0x780 [hns_roce_hw_v2] hns_roce_create_qp+0x98/0x160 [hns_roce_hw_v2] create_qp+0x138/0x258 ib_create_qp_kernel+0x50/0xe8 create_mad_qp+0xa8/0x128 ib_mad_port_open+0x218/0x448 ib_mad_init_device+0x70/0x1f8 add_client_context+0xfc/0x220 enable_device_and_get+0xd0/0x140 ib_register_device.part.0+0xf4/0x1c8 ib_register_device+0x34/0x50 hns_roce_register_device+0x174/0x3d0 [hns_roce_hw_v2] hns_roce_init+0xfc/0x2c0 [hns_roce_hw_v2] __hns_roce_hw_v2_init_instance+0x7c/0x1d0 [hns_roce_hw_v2] hns_roce_hw_v2_init_instance+0x9c/0x180 [hns_roce_hw_v2] Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20240906093444.3571619-6-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_qp.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 1de384ce4d0e..6b03ba671ff8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1460,19 +1460,19 @@ void hns_roce_lock_cqs(struct hns_roce_cq *send_cq, struct hns_roce_cq *recv_cq) __acquire(&send_cq->lock); __acquire(&recv_cq->lock); } else if (unlikely(send_cq !=
NULL && recv_cq == NULL)) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else if (unlikely(send_cq == NULL && recv_cq != NULL)) { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); __acquire(&send_cq->lock); } else if (send_cq == recv_cq) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } @@ -1492,13 +1492,13 @@ void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq, spin_unlock(&recv_cq->lock); } else if (send_cq == recv_cq) { __release(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { spin_unlock(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else { spin_unlock(&send_cq->lock); - spin_unlock_irq(&recv_cq->lock); + spin_unlock(&recv_cq->lock); } } From 4321feefa5501a746ebf6a7d8b59e6b955ae1860 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Fri, 6 Sep 2024 17:34:41 +0800 Subject: [PATCH 69/99] RDMA/hns: Fix VF triggering PF reset in abnormal interrupt handler In abnormal interrupt handler, a PF reset will be triggered even if the device is a VF. It should be a VF reset. 
Fixes: 2b9acb9a97fe ("RDMA/hns: Add the process of AEQ overflow for hip08") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20240906093444.3571619-7-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 2225c9cc6366..5483d04b3ab7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6198,6 +6198,7 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, struct pci_dev *pdev = hr_dev->pci_dev; struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); const struct hnae3_ae_ops *ops = ae_dev->ops; + enum hnae3_reset_type reset_type; irqreturn_t int_work = IRQ_NONE; u32 int_en; @@ -6209,10 +6210,12 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S); + reset_type = hr_dev->is_vf ? + HNAE3_VF_FUNC_RESET : HNAE3_FUNC_RESET; + /* Set reset level for reset_event() */ if (ops->set_default_reset_request) - ops->set_default_reset_request(ae_dev, - HNAE3_FUNC_RESET); + ops->set_default_reset_request(ae_dev, reset_type); if (ops->reset_event) ops->reset_event(pdev, NULL); From ce196f6297c7f3ab7780795e40efd6c521f60c8b Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 6 Sep 2024 17:34:42 +0800 Subject: [PATCH 70/99] RDMA/hns: Fix 1bit-ECC recovery address in non-4K OS The 1bit-ECC recovery address read from HW only contain bits 64:12, so it should be fixed left-shifted 12 bits when used. Currently, the driver will shift the address left by PAGE_SHIFT when used, which is wrong in non-4K OS. 
Fixes: 2de949abd6a5 ("RDMA/hns: Recover 1bit-ECC error of RAM on chip") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20240906093444.3571619-8-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 5483d04b3ab7..349b68d7e7db 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6285,7 +6285,7 @@ static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data) res_type == ECC_RESOURCE_SCCC) return le64_to_cpu(*data); - return le64_to_cpu(*data) << PAGE_SHIFT; + return le64_to_cpu(*data) << HNS_HW_PAGE_SHIFT; } static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type, From fe51f6254d81f5a69c31df16353d6539b2b51630 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Fri, 6 Sep 2024 17:34:43 +0800 Subject: [PATCH 71/99] RDMA/hns: Optimize hem allocation performance When allocating MTT hem, for each hop level of each hem that is being allocated, the driver iterates the hem list to find out whether the bt page has been allocated in this hop level. If not, allocate a new one and splice it to the list. The time complexity is O(n^2) in worst cases. Currently the allocation for-loop uses 'unit' as the step size. This actually has taken into account the reuse of last-hop-level MTT bt pages by multiple buffer pages. Thus pages of last hop level will never have been allocated, so there is no need to iterate the hem list in last hop level. Removing this unnecessary iteration can reduce the time complexity to O(n). 
Fixes: 38389eaa4db1 ("RDMA/hns: Add mtr support for mixed multihop addressing") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20240906093444.3571619-9-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hem.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 42111f31b371..c7c167e2a045 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -1134,10 +1134,12 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, /* config L1 bt to last bt and link them to corresponding parent */ for (level = 1; level < hopnum; level++) { - cur = hem_list_search_item(&mid_bt[level], offset); - if (cur) { - hem_ptrs[level] = cur; - continue; + if (!hem_list_is_bottom_bt(hopnum, level)) { + cur = hem_list_search_item(&mid_bt[level], offset); + if (cur) { + hem_ptrs[level] = cur; + continue; + } } step = hem_list_calc_ba_range(hopnum, level, unit); From e4ed570122544dc39ba953c99380a75a52c01db4 Mon Sep 17 00:00:00 2001 From: Zhang Zekun Date: Mon, 9 Sep 2024 20:14:07 +0800 Subject: [PATCH 72/99] IB/iser: Remove unused declaration in header file The definition of iser_finalize_rdma_unaligned_sg() has been removed since commit dd0107a08996 ("IB/iser: set block queue_virt_boundary"). Let's remove the unused declaration in header file. 
Signed-off-by: Zhang Zekun Link: https://patch.msgid.link/20240909121408.80079-2-zhangzekun11@huawei.com Reviewed-by: Kalesh AP Reviewed-by: Sagi Grimberg Acked-by: Max Gurtovoy Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/iser/iscsi_iser.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 68429a5f796d..1d7ac24c4c00 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -507,10 +507,6 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *task); void iser_free_rx_descriptors(struct iser_conn *iser_conn); -void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, - struct iser_data_buf *mem, - enum iser_data_dir cmd_dir); - int iser_reg_mem_fastreg(struct iscsi_iser_task *task, enum iser_data_dir dir, bool all_imm); From 9cd30319bbd497b9fac13e822f184b84fd50ef88 Mon Sep 17 00:00:00 2001 From: Zhang Zekun Date: Mon, 9 Sep 2024 20:14:08 +0800 Subject: [PATCH 73/99] IB/qib: Remove unused declarations in header file The definition of qib_rc_rnr_retry() has been removed since commit b4238e70579c ("IB/qib: Use new rdmavt timers"). Also, the definition of mr_rcu_callback() has been removed since commit 7c2e11fe2dbe ("IB/qib: Remove qp and mr functionality from qib"). So, let's remove the unused declarations.
Signed-off-by: Zhang Zekun Link: https://patch.msgid.link/20240909121408.80079-3-zhangzekun11@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qib/qib_verbs.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 07548fac1d8e..408fe1ba74b9 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -303,8 +303,6 @@ int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid); -void qib_rc_rnr_retry(unsigned long arg); - void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr); int qib_post_ud_send(struct rvt_qp *qp, const struct ib_send_wr *wr); @@ -312,8 +310,6 @@ int qib_post_ud_send(struct rvt_qp *qp, const struct ib_send_wr *wr); void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, int has_grh, void *data, u32 tlen, struct rvt_qp *qp); -void mr_rcu_callback(struct rcu_head *list); - void qib_migrate_qp(struct rvt_qp *qp); int qib_ruc_check_hdr(struct qib_ibport *ibp, struct ib_header *hdr, From f4ccc0a2a0c5977540f519588636b5bc81aae2db Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Mon, 9 Sep 2024 14:53:31 +0800 Subject: [PATCH 74/99] RDMA/hns: Fix restricted __le16 degrades to integer issue Fix sparse warnings: restricted __le16 degrades to integer. 
Fixes: 5a87279591a1 ("RDMA/hns: Support hns HW stats") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409080508.g4mNSLwy-lkp@intel.com/ Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20240909065331.3950268-1-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 349b68d7e7db..24e906b9d3ae 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1681,8 +1681,8 @@ static int hns_roce_hw_v2_query_counter(struct hns_roce_dev *hr_dev, for (i = 0; i < HNS_ROCE_HW_CNT_TOTAL && i < *num_counters; i++) { bd_idx = i / CNT_PER_DESC; - if (!(desc[bd_idx].flag & HNS_ROCE_CMD_FLAG_NEXT) && - bd_idx != HNS_ROCE_HW_CNT_TOTAL / CNT_PER_DESC) + if (bd_idx != HNS_ROCE_HW_CNT_TOTAL / CNT_PER_DESC && + !(desc[bd_idx].flag & cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT))) break; cnt_data = (__le64 *)&desc[bd_idx].data[0]; From cef7dde8836ab09a3bfe96ada4f18ef2496eacc9 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:04:57 +0300 Subject: [PATCH 75/99] net/mlx5: Expand mkey page size to support 6 bits Protect the usage of the 6th bit with the relevant capability to ensure we are using the new page sizes with FW that supports the bit extension. 
Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-2-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 27 ++++++++++++++++----------- drivers/infiniband/hw/mlx5/mr.c | 10 ++++------ drivers/infiniband/hw/mlx5/odp.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 7 ++++--- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 5505eb70939b..1c96f209cda6 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -63,17 +63,6 @@ __mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits, return GENMASK(largest_pg_shift, pgsz_shift); } -/* - * For mkc users, instead of a page_offset the command has a start_iova which - * specifies both the page_offset and the on-the-wire IOVA - */ -#define mlx5_umem_find_best_pgsz(umem, typ, log_pgsz_fld, pgsz_shift, iova) \ - ib_umem_find_best_pgsz(umem, \ - __mlx5_log_page_size_to_bitmap( \ - __mlx5_bit_sz(typ, log_pgsz_fld), \ - pgsz_shift), \ - iova) - static __always_inline unsigned long __mlx5_page_offset_to_bitmask(unsigned int page_offset_bits, unsigned int offset_shift) @@ -1724,4 +1713,20 @@ static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port) return (port - 1) / dev->num_ports + 1; } +/* + * For mkc users, instead of a page_offset the command has a start_iova which + * specifies both the page_offset and the on-the-wire IOVA + */ +static __always_inline unsigned long +mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem, + u64 iova) +{ + int page_size_bits = + MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5) ? 
6 : 5; + unsigned long bitmap = + __mlx5_log_page_size_to_bitmap(page_size_bits, 0); + + return ib_umem_find_best_pgsz(umem, bitmap, iova); +} + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index d5b5cd73e20c..45d9dc9c6c8f 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1120,8 +1120,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, if (umem->is_dmabuf) page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); else - page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, - 0, iova); + page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); @@ -1426,8 +1425,8 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, mr = alloc_cacheable_mr(pd, umem, iova, access_flags, MLX5_MKC_ACCESS_MODE_MTT); } else { - unsigned int page_size = mlx5_umem_find_best_pgsz( - umem, mkc, log_page_size, 0, iova); + unsigned int page_size = + mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); mutex_lock(&dev->slow_path_mutex); mr = reg_create(pd, umem, iova, access_flags, page_size, @@ -1745,8 +1744,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) return false; - *page_size = - mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); + *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova); if (WARN_ON(!*page_size)) return false; return (mr->mmkey.cache_ent->rb_key.ndescs) >= diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 44a3428ea342..221820874e7a 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -693,7 +693,7 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); u32 xlt_flags = 0; int err; - unsigned int page_size; + unsigned long page_size; if (flags & 
MLX5_PF_FLAGS_ENABLE) xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 970c9d8473ef..ec1117d4e441 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1988,7 +1988,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 migratable[0x1]; u8 reserved_at_81[0x11]; u8 query_vuid[0x1]; - u8 reserved_at_93[0xd]; + u8 reserved_at_93[0x5]; + u8 umr_log_entity_size_5[0x1]; + u8 reserved_at_99[0x7]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; @@ -4212,8 +4214,7 @@ struct mlx5_ifc_mkc_bits { u8 reserved_at_1c0[0x19]; u8 relaxed_ordering_read[0x1]; - u8 reserved_at_1d9[0x1]; - u8 log_page_size[0x5]; + u8 log_page_size[0x6]; u8 reserved_at_1e0[0x20]; }; From 6cd9171d04cff79abe78c166927ab8563bf95fe5 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:04:58 +0300 Subject: [PATCH 76/99] net/mlx5: Expose HW bits for Memory scheme ODP Expose IFC bits to support the new memory scheme on demand paging. Change the macro reading odp capabilities to be able to read from the new IFC layout and align the code in upper layers to be compiled. 
Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-3-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/odp.c | 40 +++++++------ .../net/ethernet/mellanox/mlx5/core/main.c | 28 ++++----- include/linux/mlx5/device.h | 4 ++ include/linux/mlx5/mlx5_ifc.h | 57 +++++++++++++++---- 4 files changed, 86 insertions(+), 43 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 221820874e7a..300504bf79d7 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -332,46 +332,46 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev) else dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT); - if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.send)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.srq_receive)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.send)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.receive)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.write)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.read)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.atomic)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive)) + if 
(MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.srq_receive)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.send)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.receive)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.write)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.read)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.atomic)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.srq_receive)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && @@ -388,13 +388,17 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? 
pfault->wqe.wq_num : pfault->token; u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {}; + void *info; int err; MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); - MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type); - MLX5_SET(page_fault_resume_in, in, token, pfault->token); - MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); - MLX5_SET(page_fault_resume_in, in, error, !!error); + + info = MLX5_ADDR_OF(page_fault_resume_in, in, + page_fault_info.trans_page_fault_info); + MLX5_SET(trans_page_fault_info, info, page_fault_type, pfault->type); + MLX5_SET(trans_page_fault_info, info, fault_token, pfault->token); + MLX5_SET(trans_page_fault_info, info, wq_number, wq_num); + MLX5_SET(trans_page_fault_info, info, error, !!error); err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 5b7e6f4b5c7e..cc2aa46cff04 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -479,20 +479,20 @@ static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx) } \ } while (0) - ODP_CAP_SET_MAX(dev, ud_odp_caps.srq_receive); - ODP_CAP_SET_MAX(dev, rc_odp_caps.srq_receive); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.srq_receive); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.send); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.receive); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.write); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.read); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.atomic); - ODP_CAP_SET_MAX(dev, dc_odp_caps.srq_receive); - ODP_CAP_SET_MAX(dev, dc_odp_caps.send); - ODP_CAP_SET_MAX(dev, dc_odp_caps.receive); - ODP_CAP_SET_MAX(dev, dc_odp_caps.write); - ODP_CAP_SET_MAX(dev, dc_odp_caps.read); - ODP_CAP_SET_MAX(dev, dc_odp_caps.atomic); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.ud_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, 
transport_page_fault_scheme_cap.rc_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.send); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.receive); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.write); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.read); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.atomic); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.send); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.receive); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.write); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.read); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.atomic); if (!do_set) return 0; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ba875a619b97..bd081f276654 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1369,6 +1369,10 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, cap) +#define MLX5_CAP_ODP_SCHEME(mdev, cap) \ + MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \ + transport_page_fault_scheme_cap.cap) + #define MLX5_CAP_ODP_MAX(mdev, cap)\ MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->max, cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ec1117d4e441..3e3336bb9191 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1326,11 +1326,13 @@ struct mlx5_ifc_atomic_caps_bits { u8 reserved_at_e0[0x720]; }; -struct mlx5_ifc_odp_cap_bits { +struct mlx5_ifc_odp_scheme_cap_bits { u8 reserved_at_0[0x40]; u8 sig[0x1]; - u8 reserved_at_41[0x1f]; + u8 reserved_at_41[0x4]; + 
u8 page_prefetch[0x1]; + u8 reserved_at_46[0x1a]; u8 reserved_at_60[0x20]; @@ -1344,7 +1346,20 @@ struct mlx5_ifc_odp_cap_bits { struct mlx5_ifc_odp_per_transport_service_cap_bits dc_odp_caps; - u8 reserved_at_120[0x6E0]; + u8 reserved_at_120[0xe0]; +}; + +struct mlx5_ifc_odp_cap_bits { + struct mlx5_ifc_odp_scheme_cap_bits transport_page_fault_scheme_cap; + + struct mlx5_ifc_odp_scheme_cap_bits memory_page_fault_scheme_cap; + + u8 reserved_at_400[0x200]; + + u8 mem_page_fault[0x1]; + u8 reserved_at_601[0x1f]; + + u8 reserved_at_620[0x1e0]; }; struct mlx5_ifc_tls_cap_bits { @@ -2034,7 +2049,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 min_mkey_log_entity_size_fixed_buffer[0x5]; u8 ec_vf_vport_base[0x10]; - u8 reserved_at_3a0[0x10]; + u8 reserved_at_3a0[0xa]; + u8 max_mkey_log_entity_size_mtt[0x6]; u8 max_rqt_vhca_id[0x10]; u8 reserved_at_3c0[0x20]; @@ -7258,6 +7274,30 @@ struct mlx5_ifc_qp_2err_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_trans_page_fault_info_bits { + u8 error[0x1]; + u8 reserved_at_1[0x4]; + u8 page_fault_type[0x3]; + u8 wq_number[0x18]; + + u8 reserved_at_20[0x8]; + u8 fault_token[0x18]; +}; + +struct mlx5_ifc_mem_page_fault_info_bits { + u8 error[0x1]; + u8 reserved_at_1[0xf]; + u8 fault_token_47_32[0x10]; + + u8 fault_token_31_0[0x20]; +}; + +union mlx5_ifc_page_fault_resume_in_page_fault_info_auto_bits { + struct mlx5_ifc_trans_page_fault_info_bits trans_page_fault_info; + struct mlx5_ifc_mem_page_fault_info_bits mem_page_fault_info; + u8 reserved_at_0[0x40]; +}; + struct mlx5_ifc_page_fault_resume_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7274,13 +7314,8 @@ struct mlx5_ifc_page_fault_resume_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 error[0x1]; - u8 reserved_at_41[0x4]; - u8 page_fault_type[0x3]; - u8 wq_number[0x18]; - - u8 reserved_at_60[0x8]; - u8 token[0x18]; + union mlx5_ifc_page_fault_resume_in_page_fault_info_auto_bits + page_fault_info; }; struct mlx5_ifc_nop_out_bits { From 
64c68385a39bb676c76da36164ab696e8da78842 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:04:59 +0300 Subject: [PATCH 77/99] RDMA/mlx5: Add new ODP memory scheme eqe format Add new fields to support the new memory scheme page fault and extend the token field to u64 as in the new scheme the token is 48 bit. Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-4-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/odp.c | 48 +++++++++++++++++++------------- include/linux/mlx5/device.h | 22 ++++++++++++++- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 300504bf79d7..f01026d507a3 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -45,7 +45,7 @@ /* Contains the details of a pagefault. */ struct mlx5_pagefault { u32 bytes_committed; - u32 token; + u64 token; u8 event_subtype; u8 type; union { @@ -74,6 +74,14 @@ struct mlx5_pagefault { u32 rdma_op_len; u64 rdma_va; } rdma; + struct { + u64 va; + u32 mkey; + u32 fault_byte_count; + u32 prefetch_before_byte_count; + u32 prefetch_after_byte_count; + u8 flags; + } memory; }; struct mlx5_ib_pf_eq *eq; @@ -1273,7 +1281,7 @@ read_user: if (ret) mlx5_ib_err( dev, - "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n", + "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %llx\n", ret, wqe_index, pfault->token); resolve_page_fault: @@ -1332,13 +1340,13 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, } else if (ret < 0 || pages_in_range(address, length) > ret) { mlx5_ib_page_fault_resume(dev, pfault, 1); if (ret != -ENOENT) - mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", + mlx5_ib_dbg(dev, "PAGE FAULT error %d. 
QP 0x%llx, type: 0x%x\n", ret, pfault->token, pfault->type); return; } mlx5_ib_page_fault_resume(dev, pfault, 0); - mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%llx, type: 0x%x, prefetch_activated: %d\n", pfault->token, pfault->type, prefetch_activated); @@ -1354,7 +1362,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, prefetch_len, &bytes_committed, NULL); if (ret < 0 && ret != -EAGAIN) { - mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", + mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%llx, address: 0x%.16llx, length = 0x%.16x\n", ret, pfault->token, address, prefetch_len); } } @@ -1405,15 +1413,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) pf_eqe = &eqe->data.page_fault; pfault->event_subtype = eqe->sub_type; - pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed); - - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n", - eqe->sub_type, pfault->bytes_committed); switch (eqe->sub_type) { case MLX5_PFAULT_SUBTYPE_RDMA: /* RDMA based event */ + pfault->bytes_committed = + be32_to_cpu(pf_eqe->rdma.bytes_committed); pfault->type = be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24; pfault->token = @@ -1427,10 +1432,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) be32_to_cpu(pf_eqe->rdma.rdma_op_len); pfault->rdma.rdma_va = be64_to_cpu(pf_eqe->rdma.rdma_va); - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n", - pfault->type, pfault->token, - pfault->rdma.r_key); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, r_key: 0x%08x\n", + eqe->sub_type, pfault->bytes_committed, + pfault->type, pfault->token, + pfault->rdma.r_key); mlx5_ib_dbg(eq->dev, "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n", pfault->rdma.rdma_op_len, @@ -1439,6 +1446,8 @@ 
static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) case MLX5_PFAULT_SUBTYPE_WQE: /* WQE based event */ + pfault->bytes_committed = + be32_to_cpu(pf_eqe->wqe.bytes_committed); pfault->type = (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7; pfault->token = @@ -1450,11 +1459,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) be16_to_cpu(pf_eqe->wqe.wqe_index); pfault->wqe.packet_size = be16_to_cpu(pf_eqe->wqe.packet_length); - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n", - pfault->type, pfault->token, - pfault->wqe.wq_num, - pfault->wqe.wqe_index); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, wq_num: 0x%06x, wqe_index: 0x%04x\n", + eqe->sub_type, pfault->bytes_committed, + pfault->type, pfault->token, pfault->wqe.wq_num, + pfault->wqe.wqe_index); break; default: diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index bd081f276654..154095256d0d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -211,6 +211,7 @@ enum { enum { MLX5_PFAULT_SUBTYPE_WQE = 0, MLX5_PFAULT_SUBTYPE_RDMA = 1, + MLX5_PFAULT_SUBTYPE_MEMORY = 2, }; enum wqe_page_fault_type { @@ -646,10 +647,11 @@ struct mlx5_eqe_page_req { __be32 rsvd1[5]; }; +#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096 struct mlx5_eqe_page_fault { - __be32 bytes_committed; union { struct { + __be32 bytes_committed; u16 reserved1; __be16 wqe_index; u16 reserved2; @@ -659,6 +661,7 @@ struct mlx5_eqe_page_fault { __be32 pftype_wq; } __packed wqe; struct { + __be32 bytes_committed; __be32 r_key; u16 reserved1; __be16 packet_length; @@ -666,6 +669,23 @@ struct mlx5_eqe_page_fault { __be64 rdma_va; __be32 pftype_token; } __packed rdma; + struct { + u8 flags; + u8 reserved1; + __be16 post_demand_fault_pages; + __be16 pre_demand_fault_pages; + __be16 token47_32; + __be32 token31_0; + /* + * FW changed from specifying the fault size in byte + 
* count to 4k pages granularity. The size specified + * in pages uses bits 31:12, to keep backward + * compatibility. + */ + __be32 demand_fault_pages; + __be32 mkey; + __be64 va; + } __packed memory; } __packed; } __packed; From 8c6d097d830f779fc1725fbaa1314f20a7a07b4b Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:05:00 +0300 Subject: [PATCH 78/99] RDMA/mlx5: Enforce umem boundaries for explicit ODP page faults The new memory scheme page faults are requesting the driver to fetch additional pages to the faulted memory access. This is done in order to prefetch pages before and after the area that got the page fault, assuming this will reduce the total amount of page faults. The driver should ensure it handles only the pages that are within the umem range. Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-5-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/odp.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f01026d507a3..20ad2616bed0 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -748,24 +748,31 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, * >0: Number of pages mapped */ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, - u32 *bytes_mapped, u32 flags) + u32 *bytes_mapped, u32 flags, bool permissive_fault) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - if (unlikely(io_virt < mr->ibmr.iova)) + if (unlikely(io_virt < mr->ibmr.iova) && !permissive_fault) return -EFAULT; if (mr->umem->is_dmabuf) return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags); if (!odp->is_implicit_odp) { + u64 offset = io_virt < mr->ibmr.iova ? 
0 : io_virt - mr->ibmr.iova; u64 user_va; - if (check_add_overflow(io_virt - mr->ibmr.iova, - (u64)odp->umem.address, &user_va)) + if (check_add_overflow(offset, (u64)odp->umem.address, + &user_va)) return -EFAULT; - if (unlikely(user_va >= ib_umem_end(odp) || - ib_umem_end(odp) - user_va < bcnt)) + + if (permissive_fault) { + if (user_va < ib_umem_start(odp)) + user_va = ib_umem_start(odp); + if ((user_va + bcnt) > ib_umem_end(odp)) + bcnt = ib_umem_end(odp) - user_va; + } else if (unlikely(user_va >= ib_umem_end(odp) || + ib_umem_end(odp) - user_va < bcnt)) return -EFAULT; return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped, flags); @@ -872,7 +879,7 @@ next_mr: case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); + ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false); if (ret < 0) goto end; @@ -1727,7 +1734,7 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w) for (i = 0; i < work->num_sge; ++i) { ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt, work->frags[i].length, &bytes_mapped, - work->pf_flags); + work->pf_flags, false); if (ret <= 0) continue; mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret); @@ -1778,7 +1785,7 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, if (IS_ERR(mr)) return PTR_ERR(mr); ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length, - &bytes_mapped, pf_flags); + &bytes_mapped, pf_flags, false); if (ret < 0) { mlx5r_deref_odp_mkey(&mr->mmkey); return ret; From 7f91510af938b4b308a3d716fd3dbc1b3614ca6d Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:05:01 +0300 Subject: [PATCH 79/99] RDMA/mlx5: Split ODP mkey search logic Split the search for the ODP mkey when handling an rdma type page fault to a helper function, later to be used in other page fault types. 
Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/odp.c | 65 +++++++++++++++++++------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 20ad2616bed0..05b92f4cac0e 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -819,6 +819,27 @@ static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key) return mmkey->key == key; } +static struct mlx5_ib_mkey *find_odp_mkey(struct mlx5_ib_dev *dev, u32 key) +{ + struct mlx5_ib_mkey *mmkey; + + xa_lock(&dev->odp_mkeys); + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); + if (!mmkey) { + mmkey = ERR_PTR(-ENOENT); + goto out; + } + if (!mkey_is_eq(mmkey, key)) { + mmkey = ERR_PTR(-EFAULT); + goto out; + } + refcount_inc(&mmkey->usecount); +out: + xa_unlock(&dev->odp_mkeys); + + return mmkey; +} + /* * Handle a single data segment in a page-fault WQE or RDMA region. * @@ -846,32 +867,24 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, io_virt += *bytes_committed; bcnt -= *bytes_committed; - next_mr: - xa_lock(&dev->odp_mkeys); - mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); - if (!mmkey) { - xa_unlock(&dev->odp_mkeys); - mlx5_ib_dbg( - dev, - "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", - key); - if (bytes_mapped) - *bytes_mapped += bcnt; - /* - * The user could specify a SGL with multiple lkeys and only - * some of them are ODP. Treat the non-ODP ones as fully - * faulted. 
- */ - ret = 0; - goto end; - } - refcount_inc(&mmkey->usecount); - xa_unlock(&dev->odp_mkeys); - - if (!mkey_is_eq(mmkey, key)) { - mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); - ret = -EFAULT; + mmkey = find_odp_mkey(dev, key); + if (IS_ERR(mmkey)) { + ret = PTR_ERR(mmkey); + if (ret == -ENOENT) { + mlx5_ib_dbg( + dev, + "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += bcnt; + /* + * The user could specify a SGL with multiple lkeys and + * only some of them are ODP. Treat the non-ODP ones as + * fully faulted. + */ + ret = 0; + } goto end; } @@ -966,7 +979,7 @@ next_mr: } end: - if (mmkey) + if (!IS_ERR(mmkey)) mlx5r_deref_odp_mkey(mmkey); while (head) { frame = head; From e4fda2320f8e6bfc74f01770eb95a31cb327cc09 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:05:02 +0300 Subject: [PATCH 80/99] RDMA/mlx5: Add handling for memory scheme page fault events The memory scheme page fault event is a new approach in handling page fault on mkeys using the on-demand-paging feature. The major shift in handling the page fault in this scheme is that the HW is taking responsibility for parsing the faulted mkey instead of the previous approach where the driver would read and parse the wqes and query the mkeys to get to the direct mkey that we need to handle. Therefore, the event we get from FW in this scheme will contain the direct mkey and address we need to handle and require much less work from driver. Additionally, to optimize performance, the FW can generate the event on a memory area that is larger than the faulted memory operation is requiring, to 'prefetch' memory that is around it and will likely be used soon. Unlike previous types of page fault, the memory page scheme fault does not always require a resume command after handling the page fault as the FW can post multiple events on same mkey and will set the 'last' flag only on the page fault that requires the resume command. 
Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-7-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/odp.c | 120 +++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 05b92f4cac0e..841725557f2a 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -401,12 +401,24 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); - info = MLX5_ADDR_OF(page_fault_resume_in, in, - page_fault_info.trans_page_fault_info); - MLX5_SET(trans_page_fault_info, info, page_fault_type, pfault->type); - MLX5_SET(trans_page_fault_info, info, fault_token, pfault->token); - MLX5_SET(trans_page_fault_info, info, wq_number, wq_num); - MLX5_SET(trans_page_fault_info, info, error, !!error); + if (pfault->event_subtype == MLX5_PFAULT_SUBTYPE_MEMORY) { + info = MLX5_ADDR_OF(page_fault_resume_in, in, + page_fault_info.mem_page_fault_info); + MLX5_SET(mem_page_fault_info, info, fault_token_31_0, + pfault->token & 0xffffffff); + MLX5_SET(mem_page_fault_info, info, fault_token_47_32, + (pfault->token >> 32) & 0xffff); + MLX5_SET(mem_page_fault_info, info, error, !!error); + } else { + info = MLX5_ADDR_OF(page_fault_resume_in, in, + page_fault_info.trans_page_fault_info); + MLX5_SET(trans_page_fault_info, info, page_fault_type, + pfault->type); + MLX5_SET(trans_page_fault_info, info, fault_token, + pfault->token); + MLX5_SET(trans_page_fault_info, info, wq_number, wq_num); + MLX5_SET(trans_page_fault_info, info, error, !!error); + } err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in); if (err) @@ -1388,6 +1400,63 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, } } +#define MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST BIT(7) +static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev 
*dev, + struct mlx5_pagefault *pfault) +{ + u64 prefetch_va = + pfault->memory.va - pfault->memory.prefetch_before_byte_count; + size_t prefetch_size = pfault->memory.prefetch_before_byte_count + + pfault->memory.fault_byte_count + + pfault->memory.prefetch_after_byte_count; + struct mlx5_ib_mkey *mmkey; + struct mlx5_ib_mr *mr; + int ret = 0; + + mmkey = find_odp_mkey(dev, pfault->memory.mkey); + if (IS_ERR(mmkey)) + goto err; + + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + + /* If prefetch fails, handle only demanded page fault */ + ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true); + if (ret < 0) { + ret = pagefault_mr(mr, pfault->memory.va, + pfault->memory.fault_byte_count, NULL, 0, + true); + if (ret < 0) + goto err; + } + + mlx5_update_odp_stats(mr, faults, ret); + mlx5r_deref_odp_mkey(mmkey); + + if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST) + mlx5_ib_page_fault_resume(dev, pfault, 0); + + mlx5_ib_dbg( + dev, + "PAGE FAULT completed %s. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x\n", + pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST ? + "" : + "without resume cmd", + pfault->token, pfault->memory.mkey, pfault->memory.va, + pfault->memory.fault_byte_count); + + return; + +err: + if (!IS_ERR(mmkey)) + mlx5r_deref_odp_mkey(mmkey); + mlx5_ib_page_fault_resume(dev, pfault, 1); + mlx5_ib_dbg( + dev, + "PAGE FAULT error. 
token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x, err: %d\n", + pfault->token, pfault->memory.mkey, pfault->memory.va, + pfault->memory.fault_byte_count, ret); +} + static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) { u8 event_subtype = pfault->event_subtype; @@ -1399,6 +1468,9 @@ static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfaul case MLX5_PFAULT_SUBTYPE_RDMA: mlx5_ib_mr_rdma_pfault_handler(dev, pfault); break; + case MLX5_PFAULT_SUBTYPE_MEMORY: + mlx5_ib_mr_memory_pfault_handler(dev, pfault); + break; default: mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", event_subtype); @@ -1417,6 +1489,7 @@ static void mlx5_ib_eqe_pf_action(struct work_struct *work) mempool_free(pfault, eq->pool); } +#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096 static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) { struct mlx5_eqe_page_fault *pf_eqe; @@ -1487,6 +1560,41 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) pfault->wqe.wqe_index); break; + case MLX5_PFAULT_SUBTYPE_MEMORY: + /* Memory based event */ + pfault->bytes_committed = 0; + pfault->token = + be32_to_cpu(pf_eqe->memory.token31_0) | + ((u64)be16_to_cpu(pf_eqe->memory.token47_32) + << 32); + pfault->memory.va = be64_to_cpu(pf_eqe->memory.va); + pfault->memory.mkey = be32_to_cpu(pf_eqe->memory.mkey); + pfault->memory.fault_byte_count = (be32_to_cpu( + pf_eqe->memory.demand_fault_pages) >> 12) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.prefetch_before_byte_count = + be16_to_cpu( + pf_eqe->memory.pre_demand_fault_pages) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.prefetch_after_byte_count = + be16_to_cpu( + pf_eqe->memory.post_demand_fault_pages) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.flags = pf_eqe->memory.flags; + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, token: 0x%06llx, mkey: 0x%06x, fault_byte_count: 0x%06x, va: 0x%016llx, flags: 0x%02x\n", + eqe->sub_type, 
pfault->token, + pfault->memory.mkey, + pfault->memory.fault_byte_count, + pfault->memory.va, pfault->memory.flags); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: prefetch size: before: 0x%06x, after 0x%06x\n", + pfault->memory.prefetch_before_byte_count, + pfault->memory.prefetch_after_byte_count); + break; + default: mlx5_ib_warn(eq->dev, "Unsupported page fault event sub-type: 0x%02hhx\n", From 6f2487bfafce5e6cd6f89e7238a82012f7b9f5ac Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:05:03 +0300 Subject: [PATCH 81/99] RDMA/mlx5: Add implicit MR handling to ODP memory scheme Implicit MRs in ODP memory scheme require allocating a private null mkey and assigning the mkey and va differently in the KSM mkey. The page faults are received on the null mkey so we also add storing the null mkey in the odp_mkey xarray. Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-8-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 + drivers/infiniband/hw/mlx5/odp.c | 116 +++++++++++++++++++++++++-- 2 files changed, 111 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 1c96f209cda6..59ce407ce505 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -629,6 +629,8 @@ enum mlx5_mkey_type { MLX5_MKEY_MR = 1, MLX5_MKEY_MW, MLX5_MKEY_INDIRECT_DEVX, + MLX5_MKEY_NULL, + MLX5_MKEY_IMPLICIT_CHILD, }; struct mlx5r_cache_rb_key { @@ -714,6 +716,7 @@ struct mlx5_ib_mr { struct mlx5_ib_mr *dd_crossed_mr; struct list_head dd_node; u8 revoked :1; + struct mlx5_ib_mkey null_mmkey; }; }; }; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 841725557f2a..4b37446758fd 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -107,13 +107,20 @@ static u64 mlx5_imr_ksm_entries; static void populate_klm(struct mlx5_klm *pklm, 
size_t idx, size_t nentries, struct mlx5_ib_mr *imr, int flags) { + struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev; struct mlx5_klm *end = pklm + nentries; + int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0; + __be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ? + cpu_to_be32(imr->null_mmkey.key) : + mr_to_mdev(imr)->mkeys.null_mkey; + u64 va = + MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (; pklm != end; pklm++, idx++) { + for (; pklm != end; pklm++, idx++, va += step) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; - pklm->va = 0; + pklm->key = key; + pklm->va = cpu_to_be64(va); } return; } @@ -137,7 +144,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, */ lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); - for (; pklm != end; pklm++, idx++) { + for (; pklm != end; pklm++, idx++, va += step) { struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); @@ -145,8 +152,8 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, pklm->key = cpu_to_be32(mtt->ibmr.lkey); pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); } else { - pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; - pklm->va = 0; + pklm->key = key; + pklm->va = cpu_to_be64(va); } } } @@ -225,6 +232,9 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) return; xa_erase(&imr->implicit_children, idx); + if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault)) + xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->mmkey.key)); /* Freeing a MR is a sleeping operation, so bounce to a work queue */ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); @@ -492,6 +502,16 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, } xa_unlock(&imr->implicit_children); + if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { + ret = 
xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey, GFP_KERNEL); + if (xa_is_err(ret)) { + ret = ERR_PTR(xa_err(ret)); + xa_erase(&imr->implicit_children, idx); + goto out_mr; + } + mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD; + } mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr); return mr; @@ -502,6 +522,57 @@ out_mr: return ret; } +/* + * When using memory scheme ODP, implicit MRs can't use the reserved null mkey + * and each implicit MR needs to assign a private null mkey to get the page + * faults on. + * The null mkey is created with the properties to enable getting the page + * fault for every time it is accessed and having all relevant access flags. + */ +static int alloc_implicit_mr_null_mkey(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *imr, + struct mlx5_ib_pd *pd) +{ + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 64; + void *mkc; + u32 *in; + int err; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 4); + MLX5_SET(create_mkey_in, in, pg_access, 1); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, free, 0); + MLX5_SET(mkc, mkc, umr_en, 0); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + + MLX5_SET(mkc, mkc, translations_octword_size, 4); + MLX5_SET(mkc, mkc, log_page_size, 61); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, pd, pd->pdn); + MLX5_SET64(mkc, mkc, start_addr, 0); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(dev->mdev, &imr->null_mmkey.key, in, inlen); + if (err) + goto free_in; + + imr->null_mmkey.type = MLX5_MKEY_NULL; + +free_in: + kfree(in); + return err; +} + struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags) { @@ -534,6 +605,16 @@ struct mlx5_ib_mr 
*mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, imr->is_odp_implicit = true; xa_init(&imr->implicit_children); + if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { + err = alloc_implicit_mr_null_mkey(dev, imr, pd); + if (err) + goto out_mr; + + err = mlx5r_store_odp_mkey(dev, &imr->null_mmkey); + if (err) + goto out_mr; + } + err = mlx5r_umr_update_xlt(imr, 0, mlx5_imr_ksm_entries, MLX5_KSM_PAGE_SHIFT, @@ -568,6 +649,14 @@ void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr) xa_erase(&mr->implicit_children, idx); mlx5_ib_dereg_mr(&mtt->ibmr, NULL); } + + if (mr->null_mmkey.key) { + xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->null_mmkey.key)); + + mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, + mr->null_mmkey.key); + } } #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) @@ -1410,14 +1499,25 @@ static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev, pfault->memory.fault_byte_count + pfault->memory.prefetch_after_byte_count; struct mlx5_ib_mkey *mmkey; - struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *mr, *child_mr; int ret = 0; mmkey = find_odp_mkey(dev, pfault->memory.mkey); if (IS_ERR(mmkey)) goto err; - mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + switch (mmkey->type) { + case MLX5_MKEY_IMPLICIT_CHILD: + child_mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + mr = child_mr->parent; + break; + case MLX5_MKEY_NULL: + mr = container_of(mmkey, struct mlx5_ib_mr, null_mmkey); + break; + default: + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + break; + } /* If prefetch fails, handle only demanded page fault */ ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true); From 907936b6f4e630718cc31ddea79cc76a3e32080a Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:05:04 +0300 Subject: [PATCH 82/99] net/mlx5: Handle memory scheme ODP capabilities When running over new FW that supports the new memory scheme ODP, set the cap in the FW to signal the FW we are working in the new scheme. 
In the memory scheme ODP the per_transport_service capabilities are RO for the driver so we skip their setting. Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-9-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/main.c | 22 +++++++++++++++---- include/linux/mlx5/device.h | 10 ++++++--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index cc2aa46cff04..4ec6507d094a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -454,8 +454,8 @@ static int handle_hca_cap_atomic(struct mlx5_core_dev *dev, void *set_ctx) static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx) { + bool do_set = false, mem_page_fault = false; void *set_hca_cap; - bool do_set = false; int err; if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) || @@ -470,6 +470,17 @@ static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx) memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_ODP]->cur, MLX5_ST_SZ_BYTES(odp_cap)); + /* For best performance, enable memory scheme ODP only when + * it has page prefetch enabled. 
+ */ + if (MLX5_CAP_ODP_MAX(dev, mem_page_fault) && + MLX5_CAP_ODP_MAX(dev, memory_page_fault_scheme_cap.page_prefetch)) { + mem_page_fault = true; + do_set = true; + MLX5_SET(odp_cap, set_hca_cap, mem_page_fault, mem_page_fault); + goto set; + } + #define ODP_CAP_SET_MAX(dev, field) \ do { \ u32 _res = MLX5_CAP_ODP_MAX(dev, field); \ @@ -494,10 +505,13 @@ static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx) ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.read); ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.atomic); - if (!do_set) - return 0; +set: + if (do_set) + err = set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ODP); - return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ODP); + mlx5_core_dbg(dev, "Using ODP %s scheme\n", + mem_page_fault ? "memory" : "transport"); + return err; } static int max_uc_list_get_devlink_param(struct mlx5_core_dev *dev) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 154095256d0d..57c9b18c3adb 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1389,9 +1389,13 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, cap) -#define MLX5_CAP_ODP_SCHEME(mdev, cap) \ - MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \ - transport_page_fault_scheme_cap.cap) +#define MLX5_CAP_ODP_SCHEME(mdev, cap) \ + (MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \ + mem_page_fault) ? 
\ + MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \ + memory_page_fault_scheme_cap.cap) : \ + MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \ + transport_page_fault_scheme_cap.cap)) #define MLX5_CAP_ODP_MAX(mdev, cap)\ MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->max, cap) From c77aec65e828bd82726f664585e3bb425d17be7f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 9 Sep 2024 21:47:33 +0300 Subject: [PATCH 83/99] RDMA/mlx5: Consider the query_vuid cap for data_direct Consider also the query_vuid cap before enabling the data_direct functionality. This may prevent a syndrome from the FW in case the query_vuid command is not supported. (e.g. migratable VF) Signed-off-by: Yishai Hadas Reviewed-by: Gal Shalom Link: https://patch.msgid.link/274c4f6f1ac0b1078243dd296695a49dbe58e7d1.1725907637.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b85ad3c0bfa1..af9ccae684ba 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3479,7 +3479,8 @@ static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev) char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {}; int ret; - if (!MLX5_CAP_GEN(dev->mdev, data_direct)) + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) return 0; ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid); @@ -3500,7 +3501,8 @@ static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev) static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev) { - if (!MLX5_CAP_GEN(dev->mdev, data_direct)) + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) return; mlx5_data_direct_ib_unreg(dev); From 303ee44ac4b98196a4a311c670d3db0dc38cee84 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 9 Sep 2024 20:30:19 +0300 Subject: [PATCH 84/99] RDMA/mlx5: 
Check RoCE LAG status before getting netdev Check if RoCE LAG is active before calling the LAG layer for netdev. This clarifies if LAG is active. No behavior changes with this patch. Signed-off-by: Mark Bloch Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909173025.30422-2-michaelgur@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index af9ccae684ba..f09900435aa8 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -198,12 +198,18 @@ static int mlx5_netdev_event(struct notifier_block *this, case NETDEV_CHANGE: case NETDEV_UP: case NETDEV_DOWN: { - struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev); struct net_device *upper = NULL; - if (lag_ndev) { - upper = netdev_master_upper_dev_get(lag_ndev); - dev_put(lag_ndev); + if (mlx5_lag_is_roce(mdev)) { + struct net_device *lag_ndev; + + lag_ndev = mlx5_lag_get_roce_netdev(mdev); + if (lag_ndev) { + upper = netdev_master_upper_dev_get(lag_ndev); + dev_put(lag_ndev); + } else { + goto done; + } } if (ibdev->is_rep) @@ -257,9 +263,10 @@ static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, if (!mdev) return NULL; - ndev = mlx5_lag_get_roce_netdev(mdev); - if (ndev) + if (mlx5_lag_is_roce(mdev)) { + ndev = mlx5_lag_get_roce_netdev(mdev); goto out; + } /* Ensure ndev does not disappear before we invoke dev_hold() */ From 3ed7f9e239938a0cfaf3689e2f545229ecabec06 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 9 Sep 2024 20:30:20 +0300 Subject: [PATCH 85/99] RDMA/mlx5: Obtain upper net device only when needed Report the upper device's state as the RDMA port state only in RoCE LAG or switchdev LAG. 
Fixes: 27f9e0ccb6da ("net/mlx5: Lag, Add single RDMA device in multiport mode") Signed-off-by: Mark Bloch Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909173025.30422-3-michaelgur@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index f09900435aa8..e78d50c74bc8 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -558,7 +558,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num, if (!ndev) goto out; - if (dev->lag_active) { + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { rcu_read_lock(); upper = netdev_master_upper_dev_get_rcu(ndev); if (upper) { From 91b4b2c62613dab3e1ec8083f1e96e5f5b2eee36 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Mon, 9 Sep 2024 20:30:21 +0300 Subject: [PATCH 86/99] RDMA/mlx5: Initialize phys_port_cnt earlier in RDMA device creation phys_port_cnt of the IB device must be initialized before calling ib_device_set_netdev(). Previously, phys_port_cnt was initialized in the mlx5_ib init function. Remove this initialization to allow setting it separately, providing the flexibility to call ib_device_set_netdev before registering the IB device. 
Signed-off-by: Chiara Meiohas Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909173025.30422-4-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/ib_rep.c | 1 + drivers/infiniband/hw/mlx5/main.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index c7a4ee896121..1ad934685d80 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -104,6 +104,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) ibdev->is_rep = true; vport_index = rep->vport_index; ibdev->port[vport_index].rep = rep; + ibdev->ib_dev.phys_port_cnt = num_ports; ibdev->port[vport_index].roce.netdev = mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport); ibdev->mdev = lag_master; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e78d50c74bc8..ad8a2b5517bf 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3934,7 +3934,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; - dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.dev.parent = mdev->device; dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES; @@ -4649,6 +4648,7 @@ static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, mplane->mdev = mparent->mdev; mplane->num_ports = mparent->num_plane; mplane->sub_dev_name = name; + mplane->ib_dev.phys_port_cnt = mplane->num_ports; ret = __mlx5_ib_add(mplane, &plane_profile); if (ret) @@ -4765,6 +4765,7 @@ static int mlx5r_probe(struct auxiliary_device *adev, dev->mdev = mdev; dev->num_ports = num_ports; + dev->ib_dev.phys_port_cnt = num_ports; if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev)) profile = &raw_eth_profile; From 
5f8ca04fdd3c66a322ea318b5f1cb684dd56e5b2 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Mon, 9 Sep 2024 20:30:22 +0300 Subject: [PATCH 87/99] RDMA/device: Remove optimization in ib_device_get_netdev() The caller of ib_device_get_netdev() relies on its result to accurately match a given netdev with the ib device associated netdev. ib_device_get_netdev returns NULL when the IB device associated netdev is unregistering, preventing the caller from matching netdevs properly. Thus, remove this optimization and return the netdev even if it is undergoing unregistration, allowing matching by the caller. This change ensures proper netdev matching and reference count handling by the caller of ib_device_get_netdev/ib_device_set_netdev API. Signed-off-by: Maher Sanalla Signed-off-by: Chiara Meiohas Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909173025.30422-5-michaelgur@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 0290aca18d26..b1377503cb9d 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2252,15 +2252,6 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, spin_unlock(&pdata->netdev_lock); } - /* - * If we are starting to unregister expedite things by preventing - * propagation of an unregistering netdev. - */ - if (res && res->reg_state != NETREG_REGISTERED) { - dev_put(res); - return NULL; - } - return res; } From 8d159eb2117b2e3697a31785662b653938f007cb Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Mon, 9 Sep 2024 20:30:23 +0300 Subject: [PATCH 88/99] RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()).
Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup lag mode, the active slave net device is stored in the lag itself. To ensure that the net device stored in a lag bond IB device is the active slave, we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event, call ib_device_set_netdev(). This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. That IB device will receive all netdev events of its VFs once loaded; to avoid overwriting the mapping of the PF IB device to the PF netdev, ignore NETDEV_REGISTER events if the IB device has already been mapped to a netdev.
Signed-off-by: Chiara Meiohas Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 4 + drivers/infiniband/hw/mlx5/ib_rep.c | 23 +-- drivers/infiniband/hw/mlx5/main.c | 183 ++++++++++++------ drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 76 ++++---- include/linux/mlx5/device.h | 1 + include/linux/mlx5/driver.h | 2 +- include/rdma/ib_verbs.h | 2 + 8 files changed, 191 insertions(+), 103 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index b1377503cb9d..9e765c79a892 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2236,6 +2236,9 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, if (!rdma_is_port_valid(ib_dev, port)) return NULL; + if (!ib_dev->port_data) + return NULL; + pdata = &ib_dev->port_data[port]; /* @@ -2254,6 +2257,7 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, return res; } +EXPORT_SYMBOL(ib_device_get_netdev); /** * ib_device_get_by_netdev - Find an IB device associated with a netdev diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 1ad934685d80..49af1cfbe6d1 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -13,6 +13,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, int vport_index) { struct mlx5_ib_dev *ibdev; + struct net_device *ndev; ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB); if (!ibdev) @@ -20,12 +21,9 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, ibdev->port[vport_index].rep = rep; rep->rep_data[REP_IB].priv = ibdev; - write_lock(&ibdev->port[vport_index].roce.netdev_lock); - ibdev->port[vport_index].roce.netdev = - mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - write_unlock(&ibdev->port[vport_index].roce.netdev_lock); + ndev = 
mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - return 0; + return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1); } static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev); @@ -104,11 +102,15 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) ibdev->is_rep = true; vport_index = rep->vport_index; ibdev->port[vport_index].rep = rep; - ibdev->ib_dev.phys_port_cnt = num_ports; - ibdev->port[vport_index].roce.netdev = - mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport); ibdev->mdev = lag_master; ibdev->num_ports = num_ports; + ibdev->ib_dev.phys_port_cnt = num_ports; + ret = ib_device_set_netdev(&ibdev->ib_dev, + mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, + rep->vport), + vport_index + 1); + if (ret) + goto fail_add; ret = __mlx5_ib_add(ibdev, profile); if (ret) @@ -161,9 +163,8 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) } port = &dev->port[vport_index]; - write_lock(&port->roce.netdev_lock); - port->roce.netdev = NULL; - write_unlock(&port->roce.netdev_lock); + + ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1); rep->rep_data[REP_IB].priv = NULL; port->rep = NULL; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index ad8a2b5517bf..4999239c8f41 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -147,16 +147,52 @@ static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev, if (upper && port->rep->vport == MLX5_VPORT_UPLINK) continue; - - read_lock(&port->roce.netdev_lock); - rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw, - port->rep->vport); - if (rep_ndev == ndev) { - read_unlock(&port->roce.netdev_lock); + rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1); + if (rep_ndev && rep_ndev == ndev) { + dev_put(rep_ndev); *port_num = i + 1; return &port->roce; } - read_unlock(&port->roce.netdev_lock); + + dev_put(rep_ndev); + } + + return NULL; +} + +static bool 
mlx5_netdev_send_event(struct mlx5_ib_dev *dev, + struct net_device *ndev, + struct net_device *upper, + struct net_device *ib_ndev) +{ + if (!dev->ib_active) + return false; + + /* Event is about our upper device */ + if (upper == ndev) + return true; + + /* RDMA device is not in lag and not in switchdev */ + if (!dev->is_rep && !upper && ndev == ib_ndev) + return true; + + /* RDMA devie is in switchdev */ + if (dev->is_rep && ndev == ib_ndev) + return true; + + return false; +} + +static struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_port *port; + int i; + + for (i = 0; i < ibdev->num_ports; i++) { + port = &ibdev->port[i]; + if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) { + return ib_device_get_netdev(&ibdev->ib_dev, i + 1); + } } return NULL; @@ -168,6 +204,7 @@ static int mlx5_netdev_event(struct notifier_block *this, struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); struct net_device *ndev = netdev_notifier_info_to_dev(ptr); u32 port_num = roce->native_port_num; + struct net_device *ib_ndev = NULL; struct mlx5_core_dev *mdev; struct mlx5_ib_dev *ibdev; @@ -181,29 +218,38 @@ static int mlx5_netdev_event(struct notifier_block *this, /* Should already be registered during the load */ if (ibdev->is_rep) break; - write_lock(&roce->netdev_lock); + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + /* Exit if already registered */ + if (ib_ndev) + goto put_ndev; + if (ndev->dev.parent == mdev->device) - roce->netdev = ndev; - write_unlock(&roce->netdev_lock); + ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num); break; case NETDEV_UNREGISTER: /* In case of reps, ib device goes away before the netdevs */ - write_lock(&roce->netdev_lock); - if (roce->netdev == ndev) - roce->netdev = NULL; - write_unlock(&roce->netdev_lock); - break; + if (ibdev->is_rep) + break; + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + if (ib_ndev == ndev) + 
ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num); + goto put_ndev; case NETDEV_CHANGE: case NETDEV_UP: case NETDEV_DOWN: { struct net_device *upper = NULL; - if (mlx5_lag_is_roce(mdev)) { + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { struct net_device *lag_ndev; - lag_ndev = mlx5_lag_get_roce_netdev(mdev); + if(mlx5_lag_is_roce(mdev)) + lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1); + else /* sriov lag */ + lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev); + if (lag_ndev) { upper = netdev_master_upper_dev_get(lag_ndev); dev_put(lag_ndev); @@ -216,18 +262,19 @@ static int mlx5_netdev_event(struct notifier_block *this, roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num); if (!roce) return NOTIFY_DONE; - if ((upper == ndev || - ((!upper || ibdev->is_rep) && ndev == roce->netdev)) && - ibdev->ib_active) { + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + + if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) { struct ib_event ibev = { }; enum ib_port_state port_state; if (get_port_state(&ibdev->ib_dev, port_num, &port_state)) - goto done; + goto put_ndev; if (roce->last_port_state == port_state) - goto done; + goto put_ndev; roce->last_port_state = port_state; ibev.device = &ibdev->ib_dev; @@ -236,7 +283,7 @@ static int mlx5_netdev_event(struct notifier_block *this, else if (port_state == IB_PORT_ACTIVE) ibev.event = IB_EVENT_PORT_ACTIVE; else - goto done; + goto put_ndev; ibev.element.port_num = port_num; ib_dispatch_event(&ibev); @@ -247,39 +294,13 @@ static int mlx5_netdev_event(struct notifier_block *this, default: break; } +put_ndev: + dev_put(ib_ndev); done: mlx5_ib_put_native_port_mdev(ibdev, port_num); return NOTIFY_DONE; } -static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, - u32 port_num) -{ - struct mlx5_ib_dev *ibdev = to_mdev(device); - struct net_device *ndev; - struct mlx5_core_dev *mdev; - - mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); - if (!mdev) - return NULL; - - if 
(mlx5_lag_is_roce(mdev)) { - ndev = mlx5_lag_get_roce_netdev(mdev); - goto out; - } - - /* Ensure ndev does not disappear before we invoke dev_hold() - */ - read_lock(&ibdev->port[port_num - 1].roce.netdev_lock); - ndev = ibdev->port[port_num - 1].roce.netdev; - dev_hold(ndev); - read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock); - -out: - mlx5_ib_put_native_port_mdev(ibdev, port_num); - return ndev; -} - struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, u32 ib_port_num, u32 *native_port_num) @@ -554,7 +575,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num, if (!put_mdev) goto out; - ndev = mlx5_ib_get_netdev(device, port_num); + ndev = ib_device_get_netdev(device, port_num); if (!ndev) goto out; @@ -3185,6 +3206,60 @@ static void get_dev_fw_str(struct ib_device *ibdev, char *str) fw_rev_sub(dev->mdev)); } +static int lag_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev, + lag_events); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_port *port; + struct net_device *ndev; + int i, err; + int portnum; + + portnum = 0; + switch (event) { + case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE: + ndev = data; + if (ndev) { + if (!mlx5_lag_is_roce(mdev)) { + // sriov lag + for (i = 0; i < dev->num_ports; i++) { + port = &dev->port[i]; + if (port->rep && port->rep->vport == + MLX5_VPORT_UPLINK) { + portnum = i; + break; + } + } + } + err = ib_device_set_netdev(&dev->ib_dev, ndev, + portnum + 1); + dev_put(ndev); + if (err) + return err; + /* Rescan gids after new netdev assignment */ + rdma_roce_rescan_device(&dev->ib_dev); + } + break; + default: + return NOTIFY_DONE; + } + return NOTIFY_OK; +} + +static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev) +{ + dev->lag_events.notifier_call = lag_event; + blocking_notifier_chain_register(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + +static void 
mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev) +{ + blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; @@ -3206,6 +3281,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) goto err_destroy_vport_lag; } + mlx5e_lag_event_register(dev); dev->flow_db->lag_demux_ft = ft; dev->lag_ports = mlx5_lag_get_num_ports(mdev); dev->lag_active = true; @@ -3223,6 +3299,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) if (dev->lag_active) { dev->lag_active = false; + mlx5e_lag_event_unregister(dev); mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); dev->flow_db->lag_demux_ft = NULL; @@ -3939,7 +4016,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) for (i = 0; i < dev->num_ports; i++) { spin_lock_init(&dev->port[i].mp.mpi_lock); - rwlock_init(&dev->port[i].roce.netdev_lock); dev->port[i].roce.dev = dev; dev->port[i].roce.native_port_num = i + 1; dev->port[i].roce.last_port_state = IB_PORT_DOWN; @@ -4204,7 +4280,6 @@ static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = { .create_wq = mlx5_ib_create_wq, .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table, .destroy_wq = mlx5_ib_destroy_wq, - .get_netdev = mlx5_ib_get_netdev, .modify_wq = mlx5_ib_modify_wq, INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 59ce407ce505..23fd72f7f63d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -888,8 +888,6 @@ struct mlx5_roce { /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL * netdev pointer */ - rwlock_t netdev_lock; - struct net_device *netdev; struct notifier_block nb; struct netdev_net_notifier nn; struct notifier_block mdev_nb; @@ -1138,6 +1136,7 @@ struct mlx5_ib_dev { /* protect accessing data_direct_dev */ struct mutex data_direct_lock; 
struct notifier_block mdev_events; + struct notifier_block lag_events; int num_ports; /* serialize update of capability mask */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index cf8045b92689..8577db3308cc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -445,6 +445,34 @@ static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports) return mlx5_cmd_modify_lag(dev0, ldev->ports, ports); } +static struct net_device *mlx5_lag_active_backup_get_netdev(struct mlx5_core_dev *dev) +{ + struct net_device *ndev = NULL; + struct mlx5_lag *ldev; + unsigned long flags; + int i; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + + if (!ldev) + goto unlock; + + for (i = 0; i < ldev->ports; i++) + if (ldev->tracker.netdev_state[i].tx_enabled) + ndev = ldev->pf[i].netdev; + if (!ndev) + ndev = ldev->pf[ldev->ports - 1].netdev; + + if (ndev) + dev_hold(ndev); + +unlock: + spin_unlock_irqrestore(&lag_lock, flags); + + return ndev; +} + void mlx5_modify_lag(struct mlx5_lag *ldev, struct lag_tracker *tracker) { @@ -477,9 +505,18 @@ void mlx5_modify_lag(struct mlx5_lag *ldev, } } - if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && - !(ldev->mode == MLX5_LAG_MODE_ROCE)) - mlx5_lag_drop_rule_setup(ldev, tracker); + if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + struct net_device *ndev = mlx5_lag_active_backup_get_netdev(dev0); + + if(!(ldev->mode == MLX5_LAG_MODE_ROCE)) + mlx5_lag_drop_rule_setup(ldev, tracker); + /** Only sriov and roce lag should have tracker->tx_type set so + * no need to check the mode + */ + blocking_notifier_call_chain(&dev0->priv.lag_nh, + MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE, + ndev); + } } static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev, @@ -613,6 +650,7 @@ static int mlx5_create_lag(struct mlx5_lag *ldev, mlx5_core_err(dev0, "Failed to 
deactivate RoCE LAG; driver restart required\n"); } + BLOCKING_INIT_NOTIFIER_HEAD(&dev0->priv.lag_nh); return err; } @@ -1492,38 +1530,6 @@ void mlx5_lag_enable_change(struct mlx5_core_dev *dev) mlx5_queue_bond_work(ldev, 0); } -struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev) -{ - struct net_device *ndev = NULL; - struct mlx5_lag *ldev; - unsigned long flags; - int i; - - spin_lock_irqsave(&lag_lock, flags); - ldev = mlx5_lag_dev(dev); - - if (!(ldev && __mlx5_lag_is_roce(ldev))) - goto unlock; - - if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { - for (i = 0; i < ldev->ports; i++) - if (ldev->tracker.netdev_state[i].tx_enabled) - ndev = ldev->pf[i].netdev; - if (!ndev) - ndev = ldev->pf[ldev->ports - 1].netdev; - } else { - ndev = ldev->pf[MLX5_LAG_P1].netdev; - } - if (ndev) - dev_hold(ndev); - -unlock: - spin_unlock_irqrestore(&lag_lock, flags); - - return ndev; -} -EXPORT_SYMBOL(mlx5_lag_get_roce_netdev); - u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, struct net_device *slave) { diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 57c9b18c3adb..dac33cfe9c0c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -371,6 +371,7 @@ enum mlx5_driver_event { MLX5_DRIVER_EVENT_SF_PEER_DEVLINK, MLX5_DRIVER_EVENT_AFFILIATION_DONE, MLX5_DRIVER_EVENT_AFFILIATION_REMOVED, + MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE, }; enum { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a96438ded15f..46a7a3d11048 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -643,6 +643,7 @@ struct mlx5_priv { struct mlx5_sf_hw_table *sf_hw_table; struct mlx5_sf_table *sf_table; #endif + struct blocking_notifier_head lag_nh; }; enum mlx5_device_state { @@ -1181,7 +1182,6 @@ bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev); bool mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); bool 
mlx5_lag_is_mpesw(struct mlx5_core_dev *dev); -struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, struct net_device *slave); int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a1dcf812d787..aa8ede439905 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4453,6 +4453,8 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, const struct sockaddr *addr); int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, unsigned int port); +struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, + u32 port); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata); From 9cbed5aab5aeea420d0aa945733bf608449d44fb Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Mon, 9 Sep 2024 20:30:24 +0300 Subject: [PATCH 89/99] RDMA/nldev: Add support for RDMA monitoring Introduce a new netlink command to allow rdma event monitoring. The rdma events supported now are IB device registration/unregistration and net device attachment/detachment. 
Example output of rdma monitor and the commands which trigger the events: $ rdma monitor $ rmmod mlx5_ib [UNREGISTER] dev 1 rocep8s0f1 [UNREGISTER] dev 0 rocep8s0f0 $ modprobe mlx5_ib [REGISTER] dev 2 mlx5_0 [NETDEV_ATTACH] dev 2 mlx5_0 port 1 netdev 4 eth2 [REGISTER] dev 3 mlx5_1 [NETDEV_ATTACH] dev 3 mlx5_1 port 1 netdev 5 eth3 $ devlink dev eswitch set pci/0000:08:00.0 mode switchdev [UNREGISTER] dev 2 rocep8s0f0 [REGISTER] dev 4 mlx5_0 [NETDEV_ATTACH] dev 4 mlx5_0 port 30 netdev 4 eth2 $ echo 4 > /sys/class/net/eth2/device/sriov_numvfs [NETDEV_ATTACH] dev 4 rdmap8s0f0 port 2 netdev 7 eth4 [NETDEV_ATTACH] dev 4 rdmap8s0f0 port 3 netdev 8 eth5 [NETDEV_ATTACH] dev 4 rdmap8s0f0 port 4 netdev 9 eth6 [NETDEV_ATTACH] dev 4 rdmap8s0f0 port 5 netdev 10 eth7 [REGISTER] dev 5 mlx5_0 [NETDEV_ATTACH] dev 5 mlx5_0 port 1 netdev 11 eth8 [REGISTER] dev 6 mlx5_0 [NETDEV_ATTACH] dev 6 mlx5_0 port 1 netdev 12 eth9 [REGISTER] dev 7 mlx5_0 [NETDEV_ATTACH] dev 7 mlx5_0 port 1 netdev 13 eth10 [REGISTER] dev 8 mlx5_0 [NETDEV_ATTACH] dev 8 mlx5_0 port 1 netdev 14 eth11 $ echo 0 > /sys/class/net/eth2/device/sriov_numvfs [UNREGISTER] dev 5 rocep8s0f0v0 [UNREGISTER] dev 6 rocep8s0f0v1 [UNREGISTER] dev 7 rocep8s0f0v2 [UNREGISTER] dev 8 rocep8s0f0v3 [NETDEV_DETACH] dev 4 rdmap8s0f0 port 2 [NETDEV_DETACH] dev 4 rdmap8s0f0 port 3 [NETDEV_DETACH] dev 4 rdmap8s0f0 port 4 [NETDEV_DETACH] dev 4 rdmap8s0f0 port 5 Signed-off-by: Chiara Meiohas Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909173025.30422-7-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 35 +++++++++ drivers/infiniband/core/netlink.c | 1 + drivers/infiniband/core/nldev.c | 124 ++++++++++++++++++++++++++++++ include/rdma/rdma_netlink.h | 12 +++ include/uapi/rdma/rdma_netlink.h | 15 ++++ 5 files changed, 187 insertions(+) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 9e765c79a892..e029401b5680 100644 --- 
a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1351,6 +1351,29 @@ static void prevent_dealloc_device(struct ib_device *ib_dev) { } +static void ib_device_notify_register(struct ib_device *device) +{ + struct net_device *netdev; + u32 port; + int ret; + + ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); + if (ret) + return; + + rdma_for_each_port(device, port) { + netdev = ib_device_get_netdev(device, port); + if (!netdev) + continue; + + ret = rdma_nl_notify_event(device, port, + RDMA_NETDEV_ATTACH_EVENT); + dev_put(netdev); + if (ret) + return; + } +} + /** * ib_register_device - Register an IB device with IB core * @device: Device to register @@ -1449,6 +1472,8 @@ int ib_register_device(struct ib_device *device, const char *name, dev_set_uevent_suppress(&device->dev, false); /* Mark for userspace that device is ready */ kobject_uevent(&device->dev.kobj, KOBJ_ADD); + + ib_device_notify_register(device); ib_device_put(device); return 0; @@ -1491,6 +1516,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev) goto out; disable_device(ib_dev); + rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); @@ -2159,6 +2185,7 @@ static void add_ndev_hash(struct ib_port_data *pdata) int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, u32 port) { + enum rdma_nl_notify_event_type etype; struct net_device *old_ndev; struct ib_port_data *pdata; unsigned long flags; @@ -2190,6 +2217,14 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); + + /* Make sure that the device is registered before we send events */ + if (xa_load(&devices, ib_dev->index) != ib_dev) + return 0; + + etype = ndev ? 
RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; + rdma_nl_notify_event(ib_dev, port, etype); + return 0; } EXPORT_SYMBOL(ib_device_set_netdev); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index ae2db0c70788..def14c54b648 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -311,6 +311,7 @@ int rdma_nl_net_init(struct rdma_dev_net *rnet) struct net *net = read_pnet(&rnet->net); struct netlink_kernel_cfg cfg = { .input = rdma_nl_rcv, + .flags = NL_CFG_F_NONROOT_RECV, }; struct sock *nls; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 4d4a1f90e484..70b3fa0469f2 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -170,6 +170,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING }, [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -2722,6 +2723,129 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { }, }; +static int fill_mon_netdev_association(struct sk_buff *msg, + struct ib_device *device, u32 port, + const struct net *net) +{ + struct net_device *netdev = ib_device_get_netdev(device, port); + int ret = 0; + + if (netdev && !net_eq(dev_net(netdev), net)) + goto out; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index); + if (ret) + goto out; + + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, + dev_name(&device->dev)); + if (ret) + goto out; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port); + if (ret) + goto out; + + if (netdev) { + ret = nla_put_u32(msg, + RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); + if (ret) + goto out; + + ret = nla_put_string(msg, + RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); + 
} + +out: + dev_put(netdev); + return ret; +} + +static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num, + enum rdma_nl_notify_event_type type) +{ + struct net_device *netdev; + + switch (type) { + case RDMA_REGISTER_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor register device event\n"); + break; + case RDMA_UNREGISTER_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor unregister device event\n"); + break; + case RDMA_NETDEV_ATTACH_EVENT: + netdev = ib_device_get_netdev(device, port_num); + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n", + port_num, netdev->ifindex); + dev_put(netdev); + break; + case RDMA_NETDEV_DETACH_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor netdev detach event: port %d\n", + port_num); + default: + break; + } +} + +int rdma_nl_notify_event(struct ib_device *device, u32 port_num, + enum rdma_nl_notify_event_type type) +{ + struct sk_buff *skb; + struct net *net; + int ret = 0; + void *nlh; + + net = read_pnet(&device->coredev.rdma_net); + if (!net) + return -EINVAL; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + nlh = nlmsg_put(skb, 0, 0, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR), + 0, 0); + + switch (type) { + case RDMA_REGISTER_EVENT: + case RDMA_UNREGISTER_EVENT: + ret = fill_nldev_handle(skb, device); + if (ret) + goto err_free; + break; + case RDMA_NETDEV_ATTACH_EVENT: + case RDMA_NETDEV_DETACH_EVENT: + ret = fill_mon_netdev_association(skb, device, + port_num, net); + if (ret) + goto err_free; + break; + default: + break; + } + + ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type); + if (ret) + goto err_free; + + nlmsg_end(skb, nlh); + ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL); + if (ret && ret != -ESRCH) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + goto err_free; + } + 
return 0; + +err_free: + rdma_nl_notify_err_msg(device, port_num, type); + nlmsg_free(skb); + return ret; +} + void __init nldev_init(void) { rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index c2a79aeee113..326deaf56d5d 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -6,6 +6,8 @@ #include #include +struct ib_device; + enum { RDMA_NLDEV_ATTR_EMPTY_STRING = 1, RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16, @@ -110,6 +112,16 @@ int rdma_nl_multicast(struct net *net, struct sk_buff *skb, */ bool rdma_nl_chk_listeners(unsigned int group); +/** + * Prepare and send an event message + * @ib: the IB device which triggered the event + * @port_num: the port number which triggered the event - 0 if unused + * @type: the event type + * Returns 0 on success or a negative error code + */ +int rdma_nl_notify_event(struct ib_device *ib, u32 port_num, + enum rdma_nl_notify_event_type type); + struct rdma_link_ops { struct list_head list; const char *type; diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 2f37568f5556..5f9636d26050 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -15,6 +15,7 @@ enum { enum { RDMA_NL_GROUP_IWPM = 2, RDMA_NL_GROUP_LS, + RDMA_NL_GROUP_NOTIFY, RDMA_NL_NUM_GROUPS }; @@ -305,6 +306,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_DELDEV, + RDMA_NLDEV_CMD_MONITOR, + RDMA_NLDEV_NUM_OPS }; @@ -574,6 +577,8 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE, /* u8 */ + RDMA_NLDEV_ATTR_EVENT_TYPE, /* u8 */ + /* * Always the end */ @@ -624,4 +629,14 @@ enum rdma_nl_name_assign_type { RDMA_NAME_ASSIGN_TYPE_USER = 1, /* Provided by user-space */ }; +/* + * Supported rdma monitoring event types. 
+ */ +enum rdma_nl_notify_event_type { + RDMA_REGISTER_EVENT, + RDMA_UNREGISTER_EVENT, + RDMA_NETDEV_ATTACH_EVENT, + RDMA_NETDEV_DETACH_EVENT, +}; + #endif /* _UAPI_RDMA_NETLINK_H */ From 12fb1153c53bf9b53e299c9775b84fa7838640f7 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Mon, 9 Sep 2024 20:30:25 +0300 Subject: [PATCH 90/99] RDMA/nldev: Expose whether RDMA monitoring is supported Extend the "rdma sys" command to display whether RDMA monitoring is supported. RDMA monitoring is not supported in mlx4 because it does not use the ib_device_set_netdev() API, which sends the RDMA events. Example output for kernel where monitoring is supported: $ rdma sys show netns shared privileged-qkey off monitor on copy-on-fork on Example output for kernel where monitoring is not supported: $ rdma sys show netns shared privileged-qkey off monitor off copy-on-fork on Signed-off-by: Chiara Meiohas Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909173025.30422-8-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 6 ++++++ include/uapi/rdma/rdma_netlink.h | 1 + 2 files changed, 7 insertions(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 70b3fa0469f2..10b1411ac53d 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1952,6 +1952,12 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_free(msg); return err; } + + err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, 1); + if (err) { + nlmsg_free(msg); + return err; + } /* * Copy-on-fork is supported. 
* See commits: diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 5f9636d26050..39be09c0ffbb 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -579,6 +579,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_EVENT_TYPE, /* u8 */ + RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, /* u8 */ /* * Always the end */ From dee3da3422d5e8658b996243dd1ddc774bbf31f3 Mon Sep 17 00:00:00 2001 From: Chandramohan Akula Date: Tue, 10 Sep 2024 21:08:27 -0700 Subject: [PATCH 91/99] RDMA/bnxt_re: Change aux driver data to en_info to hold more information rdev will be destroyed and recreated during the FW error recovery scenarios. So to keep the state, if any, use an en_info structure which gets created/freed based on auxiliary device initialization/de-initialization. Signed-off-by: Chandramohan Akula Reviewed-by: Kashyap Desai Reviewed-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1726027710-2292-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 6 ++ drivers/infiniband/hw/bnxt_re/main.c | 73 +++++++++++++++++++++---- 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 2be9a62d230f..5df3ce1284c7 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -107,6 +107,11 @@ struct bnxt_re_gsi_context { struct bnxt_re_sqp_entries *sqp_tbl; }; +struct bnxt_re_en_dev_info { + struct bnxt_en_dev *en_dev; + struct bnxt_re_dev *rdev; +}; + #define BNXT_RE_AEQ_IDX 0 #define BNXT_RE_NQ_IDX 1 #define BNXT_RE_GEN_P5_MAX_VF 64 @@ -155,6 +160,7 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_ERR_DEVICE_DETACHED 17 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; + struct auxiliary_device *adev; struct notifier_block nb; unsigned int version, major, minor; struct bnxt_qplib_chip_ctx *chip_ctx; diff 
--git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 16a84ca1ce48..085a03cc6d52 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -292,10 +292,13 @@ static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev) static void bnxt_re_shutdown(struct auxiliary_device *adev) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; - if (!rdev) + if (!en_info) return; + + rdev = en_info->rdev; ib_unregister_device(&rdev->ibdev); bnxt_re_dev_uninit(rdev); } @@ -1794,14 +1797,33 @@ fail: return rc; } +static void bnxt_re_update_en_info_rdev(struct bnxt_re_dev *rdev, + struct bnxt_re_en_dev_info *en_info, + struct auxiliary_device *adev) +{ + /* Before updating the rdev pointer in bnxt_re_en_dev_info structure, + * take the rtnl lock to avoid accessing invalid rdev pointer from + * L2 ULP callbacks. This is applicable in all the places where rdev + * pointer is updated in bnxt_re_en_dev_info. + */ + rtnl_lock(); + en_info->rdev = rdev; + rdev->adev = adev; + rtnl_unlock(); +} + static int bnxt_re_add_device(struct auxiliary_device *adev) { struct bnxt_aux_priv *aux_priv = container_of(adev, struct bnxt_aux_priv, aux_dev); + struct bnxt_re_en_dev_info *en_info; struct bnxt_en_dev *en_dev; struct bnxt_re_dev *rdev; int rc; + en_info = auxiliary_get_drvdata(adev); + en_dev = en_info->en_dev; + /* en_dev should never be NULL as long as adev and aux_dev are valid. 
*/ en_dev = aux_priv->edev; @@ -1811,6 +1833,8 @@ static int bnxt_re_add_device(struct auxiliary_device *adev) goto exit; } + bnxt_re_update_en_info_rdev(rdev, en_info, adev); + rc = bnxt_re_dev_init(rdev); if (rc) goto re_dev_dealloc; @@ -1821,11 +1845,11 @@ static int bnxt_re_add_device(struct auxiliary_device *adev) aux_priv->aux_dev.name); goto re_dev_uninit; } - auxiliary_set_drvdata(adev, rdev); return 0; re_dev_uninit: + bnxt_re_update_en_info_rdev(NULL, en_info, adev); bnxt_re_dev_uninit(rdev); re_dev_dealloc: ib_dealloc_device(&rdev->ibdev); @@ -1911,12 +1935,18 @@ exit: static void bnxt_re_remove(struct auxiliary_device *adev) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); - - if (!rdev) - return; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; mutex_lock(&bnxt_re_mutex); + if (!en_info) { + mutex_unlock(&bnxt_re_mutex); + return; + } + rdev = en_info->rdev; + if (!rdev) + goto skip_remove; + if (rdev->nb.notifier_call) { unregister_netdevice_notifier(&rdev->nb); rdev->nb.notifier_call = NULL; @@ -1931,16 +1961,31 @@ static void bnxt_re_remove(struct auxiliary_device *adev) bnxt_re_dev_uninit(rdev); ib_dealloc_device(&rdev->ibdev); skip_remove: + kfree(en_info); mutex_unlock(&bnxt_re_mutex); } static int bnxt_re_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { + struct bnxt_aux_priv *aux_priv = + container_of(adev, struct bnxt_aux_priv, aux_dev); + struct bnxt_re_en_dev_info *en_info; + struct bnxt_en_dev *en_dev; struct bnxt_re_dev *rdev; int rc; + en_dev = aux_priv->edev; + mutex_lock(&bnxt_re_mutex); + en_info = kzalloc(sizeof(*en_info), GFP_KERNEL); + if (!en_info) { + mutex_unlock(&bnxt_re_mutex); + return -ENOMEM; + } + en_info->en_dev = en_dev; + + auxiliary_set_drvdata(adev, en_info); rc = bnxt_re_add_device(adev); if (rc) { @@ -1948,7 +1993,7 @@ static int bnxt_re_probe(struct auxiliary_device *adev, return rc; } - rdev = auxiliary_get_drvdata(adev); + rdev 
= en_info->rdev; rdev->nb.notifier_call = bnxt_re_netdev_event; rc = register_netdevice_notifier(&rdev->nb); @@ -1972,11 +2017,13 @@ err: static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; - if (!rdev) + if (!en_info) return 0; + rdev = en_info->rdev; mutex_lock(&bnxt_re_mutex); /* L2 driver may invoke this callback during device error/crash or device * reset. Current RoCE driver doesn't recover the device in case of @@ -2009,11 +2056,13 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) static int bnxt_re_resume(struct auxiliary_device *adev) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; - if (!rdev) + if (!en_info) return 0; + rdev = en_info->rdev; mutex_lock(&bnxt_re_mutex); /* L2 driver may invoke this callback during device recovery, resume. * reset. Current RoCE driver doesn't recover the device in case of From 532929ad0a23b9b94ab840e50ca0486033611914 Mon Sep 17 00:00:00 2001 From: Chandramohan Akula Date: Tue, 10 Sep 2024 21:08:28 -0700 Subject: [PATCH 92/99] RDMA/bnxt_re: Use the aux device for L2 ULP callbacks While registering with the L2 for ULP operations, use the aux device pointer as the handle. Aux device has the data bnxt_re_en_dev_info, which is used to store required information for the bnxt_re_suspend and bnxt_re_resume functions. 
Signed-off-by: Chandramohan Akula Reviewed-by: Kalesh AP Reviewed-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1726027710-2292-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 085a03cc6d52..2a916998348e 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -305,11 +305,18 @@ static void bnxt_re_shutdown(struct auxiliary_device *adev) static void bnxt_re_stop_irq(void *handle) { - struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; - struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); + struct bnxt_qplib_rcfw *rcfw; + struct bnxt_re_dev *rdev; struct bnxt_qplib_nq *nq; int indx; + if (!en_info) + return; + + rdev = en_info->rdev; + rcfw = &rdev->rcfw; + for (indx = BNXT_RE_NQ_IDX; indx < rdev->num_msix; indx++) { nq = &rdev->nq[indx - 1]; bnxt_qplib_nq_stop_irq(nq, false); @@ -320,12 +327,19 @@ static void bnxt_re_stop_irq(void *handle) static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) { - struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; - struct bnxt_msix_entry *msix_ent = rdev->en_dev->msix_entries; - struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); + struct bnxt_msix_entry *msix_ent; + struct bnxt_qplib_rcfw *rcfw; + struct bnxt_re_dev *rdev; struct bnxt_qplib_nq *nq; int indx, rc; + if (!en_info) + return; + + rdev = en_info->rdev; + msix_ent = rdev->en_dev->msix_entries; + rcfw = &rdev->rcfw; if (!ent) { /* Not setting the f/w timeout bit in rcfw. 
* During the driver unload the first command @@ -374,7 +388,7 @@ static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) en_dev = rdev->en_dev; - rc = bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev); + rc = bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev->adev); if (!rc) rdev->qplib_res.pdev = rdev->en_dev->pdev; return rc; From 94a9dc6ac8f7e6801c88d05c42ed9ceaa4b5f609 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 10 Sep 2024 21:08:29 -0700 Subject: [PATCH 93/99] RDMA/bnxt_re: Group all operations under add_device and remove_device Adding and removing device need to be handled from multiple contexts when Firmware error recovery is supported. So group all the add and remove operations to add_device and remove_device function. Signed-off-by: Chandramohan Akula Reviewed-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1726027710-2292-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 65 ++++++++++++++-------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 2a916998348e..dc63ad07b2c2 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -88,6 +88,7 @@ static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, u32 *offset); +static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable); static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *cctx; @@ -1860,6 +1861,16 @@ static int bnxt_re_add_device(struct auxiliary_device *adev) goto re_dev_uninit; } + rdev->nb.notifier_call = bnxt_re_netdev_event; + rc = register_netdevice_notifier(&rdev->nb); + if (rc) { + rdev->nb.notifier_call = NULL; + pr_err("%s: Cannot register to netdevice_notifier", + ROCE_DRV_MODULE_NAME); + return rc; + } + bnxt_re_setup_cc(rdev, true); 
+ return 0; re_dev_uninit: @@ -1947,20 +1958,9 @@ exit: #define BNXT_ADEV_NAME "bnxt_en" -static void bnxt_re_remove(struct auxiliary_device *adev) +static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, + struct auxiliary_device *aux_dev) { - struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); - struct bnxt_re_dev *rdev; - - mutex_lock(&bnxt_re_mutex); - if (!en_info) { - mutex_unlock(&bnxt_re_mutex); - return; - } - rdev = en_info->rdev; - if (!rdev) - goto skip_remove; - if (rdev->nb.notifier_call) { unregister_netdevice_notifier(&rdev->nb); rdev->nb.notifier_call = NULL; @@ -1968,13 +1968,30 @@ static void bnxt_re_remove(struct auxiliary_device *adev) /* If notifier is null, we should have already done a * clean up before coming here. */ - goto skip_remove; + return; } bnxt_re_setup_cc(rdev, false); ib_unregister_device(&rdev->ibdev); bnxt_re_dev_uninit(rdev); ib_dealloc_device(&rdev->ibdev); -skip_remove: +} + +static void bnxt_re_remove(struct auxiliary_device *adev) +{ + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_en_dev *en_dev; + struct bnxt_re_dev *rdev; + + mutex_lock(&bnxt_re_mutex); + if (!en_info) { + mutex_unlock(&bnxt_re_mutex); + return; + } + en_dev = en_info->en_dev; + rdev = en_info->rdev; + + if (rdev) + bnxt_re_remove_device(rdev, adev); kfree(en_info); mutex_unlock(&bnxt_re_mutex); } @@ -1986,7 +2003,6 @@ static int bnxt_re_probe(struct auxiliary_device *adev, container_of(adev, struct bnxt_aux_priv, aux_dev); struct bnxt_re_en_dev_info *en_info; struct bnxt_en_dev *en_dev; - struct bnxt_re_dev *rdev; int rc; en_dev = aux_priv->edev; @@ -2002,23 +2018,8 @@ static int bnxt_re_probe(struct auxiliary_device *adev, auxiliary_set_drvdata(adev, en_info); rc = bnxt_re_add_device(adev); - if (rc) { - mutex_unlock(&bnxt_re_mutex); - return rc; - } - - rdev = en_info->rdev; - - rdev->nb.notifier_call = bnxt_re_netdev_event; - rc = register_netdevice_notifier(&rdev->nb); - if (rc) { - 
rdev->nb.notifier_call = NULL; - pr_err("%s: Cannot register to netdevice_notifier", - ROCE_DRV_MODULE_NAME); + if (rc) goto err; - } - - bnxt_re_setup_cc(rdev, true); mutex_unlock(&bnxt_re_mutex); return 0; From cc5b9b48d44756a87170f3901c6c2fd99e6b89b2 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 10 Sep 2024 21:08:30 -0700 Subject: [PATCH 94/99] RDMA/bnxt_re: Recover the device when FW error is detected If the FW crashes, the L2 driver gets notified and it notifies the RoCE driver. Currently the driver doesn't re-initialize the device. Add support for re-initializing the RoCE device. The RoCE device is removed and re-attached in ulp_stop and ulp_start respectively. The recovery logic expects the RoCE driver to be registered with the L2 driver while it is being removed, so the driver avoids unregistering with the L2 driver in the recovery path. Signed-off-by: Chandramohan Akula Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1726027710-2292-5-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 15 +++++ drivers/infiniband/hw/bnxt_re/main.c | 70 +++++++++++++---------- drivers/infiniband/hw/bnxt_re/qplib_res.h | 1 + 3 files changed, 55 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 5df3ce1284c7..e94518b12f86 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -91,6 +91,15 @@ struct bnxt_re_ring_attr { u8 mode; }; +/* + * Data structure and defines to handle + * recovery + */ +#define BNXT_RE_PRE_RECOVERY_REMOVE 0x1 +#define BNXT_RE_COMPLETE_REMOVE 0x2 +#define BNXT_RE_POST_RECOVERY_INIT 0x4 +#define BNXT_RE_COMPLETE_INIT 0x8 + struct bnxt_re_sqp_entries { struct bnxt_qplib_sge sge; u64 wrid; @@ -224,4 +233,10 @@ static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) } extern const struct uapi_definition bnxt_re_uapi_defs[]; + 
+static inline void bnxt_re_set_pacing_dev_state(struct bnxt_re_dev *rdev) +{ + rdev->qplib_res.pacing_data->dev_err_state = + test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); +} #endif diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index dc63ad07b2c2..adff9e494c9d 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -83,7 +83,7 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev); static int bnxt_re_netdev_event(struct notifier_block *notifier, unsigned long event, void *ptr); static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev); -static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev); +static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type); static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, @@ -169,6 +169,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) en_dev = rdev->en_dev; + rdev->qplib_res.pdev = en_dev->pdev; chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL); if (!chip_ctx) return -ENOMEM; @@ -301,7 +302,7 @@ static void bnxt_re_shutdown(struct auxiliary_device *adev) rdev = en_info->rdev; ib_unregister_device(&rdev->ibdev); - bnxt_re_dev_uninit(rdev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); } static void bnxt_re_stop_irq(void *handle) @@ -385,14 +386,9 @@ static struct bnxt_ulp_ops bnxt_re_ulp_ops = { static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev; - int rc; en_dev = rdev->en_dev; - - rc = bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev->adev); - if (!rc) - rdev->qplib_res.pdev = rdev->en_dev->pdev; - return rc; + return bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev->adev); } static void bnxt_re_init_hwrm_hdr(struct input *hdr, u16 opcd) @@ -1593,7 +1589,7 @@ static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) return rc; } -static void bnxt_re_dev_uninit(struct bnxt_re_dev 
*rdev) +static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) { u8 type; int rc; @@ -1626,8 +1622,10 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev) bnxt_re_deinitialize_dbr_pacing(rdev); bnxt_re_destroy_chip_ctx(rdev); - if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) - bnxt_unregister_dev(rdev->en_dev); + if (op_type == BNXT_RE_COMPLETE_REMOVE) { + if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) + bnxt_unregister_dev(rdev->en_dev); + } } /* worker thread for polling periodic events. Now used for QoS programming*/ @@ -1640,7 +1638,7 @@ static void bnxt_re_worker(struct work_struct *work) schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); } -static int bnxt_re_dev_init(struct bnxt_re_dev *rdev) +static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) { struct bnxt_re_ring_attr rattr = {}; struct bnxt_qplib_creq_ctx *creq; @@ -1649,12 +1647,14 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev) u8 type; int rc; - /* Registered a new RoCE device instance to netdev */ - rc = bnxt_re_register_netdev(rdev); - if (rc) { - ibdev_err(&rdev->ibdev, - "Failed to register with netedev: %#x\n", rc); - return -EINVAL; + if (op_type == BNXT_RE_COMPLETE_INIT) { + /* Registered a new RoCE device instance to netdev */ + rc = bnxt_re_register_netdev(rdev); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to register with netedev: %#x\n", rc); + return -EINVAL; + } } set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); @@ -1807,7 +1807,7 @@ free_ring: free_rcfw: bnxt_qplib_free_rcfw_channel(&rdev->rcfw); fail: - bnxt_re_dev_uninit(rdev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); return rc; } @@ -1827,7 +1827,7 @@ static void bnxt_re_update_en_info_rdev(struct bnxt_re_dev *rdev, rtnl_unlock(); } -static int bnxt_re_add_device(struct auxiliary_device *adev) +static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type) { struct bnxt_aux_priv *aux_priv = 
container_of(adev, struct bnxt_aux_priv, aux_dev); @@ -1839,8 +1839,6 @@ static int bnxt_re_add_device(struct auxiliary_device *adev) en_info = auxiliary_get_drvdata(adev); en_dev = en_info->en_dev; - /* en_dev should never be NULL as long as adev and aux_dev are valid. */ - en_dev = aux_priv->edev; rdev = bnxt_re_dev_add(aux_priv, en_dev); if (!rdev || !rdev_to_dev(rdev)) { @@ -1850,7 +1848,7 @@ static int bnxt_re_add_device(struct auxiliary_device *adev) bnxt_re_update_en_info_rdev(rdev, en_info, adev); - rc = bnxt_re_dev_init(rdev); + rc = bnxt_re_dev_init(rdev, op_type); if (rc) goto re_dev_dealloc; @@ -1875,7 +1873,7 @@ static int bnxt_re_add_device(struct auxiliary_device *adev) re_dev_uninit: bnxt_re_update_en_info_rdev(NULL, en_info, adev); - bnxt_re_dev_uninit(rdev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); re_dev_dealloc: ib_dealloc_device(&rdev->ibdev); exit: @@ -1958,7 +1956,7 @@ exit: #define BNXT_ADEV_NAME "bnxt_en" -static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, +static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, u8 op_type, struct auxiliary_device *aux_dev) { if (rdev->nb.notifier_call) { @@ -1972,7 +1970,7 @@ static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, } bnxt_re_setup_cc(rdev, false); ib_unregister_device(&rdev->ibdev); - bnxt_re_dev_uninit(rdev); + bnxt_re_dev_uninit(rdev, op_type); ib_dealloc_device(&rdev->ibdev); } @@ -1991,7 +1989,7 @@ static void bnxt_re_remove(struct auxiliary_device *adev) rdev = en_info->rdev; if (rdev) - bnxt_re_remove_device(rdev, adev); + bnxt_re_remove_device(rdev, BNXT_RE_COMPLETE_REMOVE, adev); kfree(en_info); mutex_unlock(&bnxt_re_mutex); } @@ -2017,7 +2015,7 @@ static int bnxt_re_probe(struct auxiliary_device *adev, auxiliary_set_drvdata(adev, en_info); - rc = bnxt_re_add_device(adev); + rc = bnxt_re_add_device(adev, BNXT_RE_COMPLETE_INIT); if (rc) goto err; mutex_unlock(&bnxt_re_mutex); @@ -2033,12 +2031,14 @@ err: static int bnxt_re_suspend(struct auxiliary_device 
*adev, pm_message_t state) { struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_en_dev *en_dev; struct bnxt_re_dev *rdev; if (!en_info) return 0; rdev = en_info->rdev; + en_dev = en_info->en_dev; mutex_lock(&bnxt_re_mutex); /* L2 driver may invoke this callback during device error/crash or device * reset. Current RoCE driver doesn't recover the device in case of @@ -2057,13 +2057,20 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); bnxt_re_dev_stop(rdev); - bnxt_re_stop_irq(rdev); + bnxt_re_stop_irq(adev); /* Move the device states to detached and avoid sending any more * commands to HW */ set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); wake_up_all(&rdev->rcfw.cmdq.waitq); + + if (rdev->pacing.dbr_pacing) + bnxt_re_set_pacing_dev_state(rdev); + + ibdev_info(&rdev->ibdev, "%s: L2 driver notified to stop en_state 0x%lx", + __func__, en_dev->en_state); + bnxt_re_remove_device(rdev, BNXT_RE_PRE_RECOVERY_REMOVE, adev); mutex_unlock(&bnxt_re_mutex); return 0; @@ -2077,7 +2084,6 @@ static int bnxt_re_resume(struct auxiliary_device *adev) if (!en_info) return 0; - rdev = en_info->rdev; mutex_lock(&bnxt_re_mutex); /* L2 driver may invoke this callback during device recovery, resume. * reset. Current RoCE driver doesn't recover the device in case of @@ -2086,7 +2092,9 @@ static int bnxt_re_resume(struct auxiliary_device *adev) * L2 driver want to modify the MSIx table. 
*/ - ibdev_info(&rdev->ibdev, "Handle device resume call"); + bnxt_re_add_device(adev, BNXT_RE_POST_RECOVERY_INIT); + rdev = en_info->rdev; + ibdev_info(&rdev->ibdev, "Device resume completed"); mutex_unlock(&bnxt_re_mutex); return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 049805ac95cf..c2f710364e0f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -82,6 +82,7 @@ struct bnxt_qplib_db_pacing_data { u32 fifo_room_mask; u32 fifo_room_shift; u32 grc_reg_offset; + u32 dev_err_state; }; #define BNXT_QPLIB_DBR_PF_DB_OFFSET 0x10000 From 39c047d4047a1242aeefa87513174b56a91080ab Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Thu, 12 Sep 2024 19:57:00 +0800 Subject: [PATCH 95/99] RDMA/hns: Fix ah error counter in sw stat not increasing There are several error cases where hns_roce_create_ah() returns directly without jumping to sw stat path, thus leading to a problem that the ah error counter does not increase. 
Fixes: ee20cc17e9d8 ("RDMA/hns: Support DSCP") Fixes: eb7854d63db5 ("RDMA/hns: Support SW stats with debugfs") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20240912115700.2016443-1-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_ah.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 3e02c474f59f..4fc5b9d5fea8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -64,8 +64,10 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, u8 tc_mode = 0; int ret; - if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) - return -EOPNOTSUPP; + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) { + ret = -EOPNOTSUPP; + goto err_out; + } ah->av.port = rdma_ah_get_port_num(ah_attr); ah->av.gid_index = grh->sgid_index; @@ -83,7 +85,7 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, ret = 0; if (ret && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - return ret; + goto err_out; if (tc_mode == HNAE3_TC_MAP_MODE_DSCP && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) @@ -91,8 +93,10 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, else ah->av.sl = rdma_ah_get_sl(ah_attr); - if (!check_sl_valid(hr_dev, ah->av.sl)) - return -EINVAL; + if (!check_sl_valid(hr_dev, ah->av.sl)) { + ret = -EINVAL; + goto err_out; + } memcpy(ah->av.dgid, grh->dgid.raw, HNS_ROCE_GID_SIZE); memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN); From e766e6a92410ca269161de059fff0843b8ddd65f Mon Sep 17 00:00:00 2001 From: Mikhail Lobanov Date: Thu, 12 Sep 2024 10:58:39 -0400 Subject: [PATCH 96/99] RDMA/cxgb4: Added NULL check for lookup_atid The lookup_atid() function can return NULL if the ATID is invalid or does not exist in the identifier table, which 
could lead to dereferencing a null pointer without a check in the `act_establish()` and `act_open_rpl()` functions. Add a NULL check to prevent null pointer dereferencing. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: cfdda9d76436 ("RDMA/cxgb4: Add driver for Chelsio T4 RNIC") Signed-off-by: Mikhail Lobanov Link: https://patch.msgid.link/20240912145844.77516-1-m.lobanov@rosalinux.ru Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/cxgb4/cm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 040ba2224f9f..b3757c6a0457 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1222,6 +1222,8 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) int ret; ep = lookup_atid(t, atid); + if (!ep) + return -EINVAL; pr_debug("ep %p tid %u snd_isn %u rcv_isn %u\n", ep, tid, be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn)); @@ -2279,6 +2281,9 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) int ret = 0; ep = lookup_atid(t, atid); + if (!ep) + return -EINVAL; + la = (struct sockaddr_in *)&ep->com.local_addr; ra = (struct sockaddr_in *)&ep->com.remote_addr; la6 = (struct sockaddr_in6 *)&ep->com.local_addr; From 9f0eafe86ea0a589676209d0cff1a1ed49a037d3 Mon Sep 17 00:00:00 2001 From: Vitaliy Shevtsov Date: Mon, 16 Sep 2024 21:58:05 +0500 Subject: [PATCH 97/99] RDMA/irdma: fix error message in irdma_modify_qp_roce() Use a correct field max_dest_rd_atomic instead of max_rd_atomic for the error output. Found by Linux Verification Center (linuxtesting.org) with Svace. 
Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Signed-off-by: Vitaliy Shevtsov Link: https://lore.kernel.org/stable/20240916165817.14691-1-v.shevtsov%40maxima.ru Link: https://patch.msgid.link/20240916165817.14691-1-v.shevtsov@maxima.ru Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 6a107decb704..eeb932e58730 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -1347,7 +1347,7 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr->max_dest_rd_atomic > dev->hw_attrs.max_hw_ird) { ibdev_err(&iwdev->ibdev, "rd_atomic = %d, above max_hw_ird=%d\n", - attr->max_rd_atomic, + attr->max_dest_rd_atomic, dev->hw_attrs.max_hw_ird); return -EINVAL; } From 7acad3c442df6d5158c5b732a7a0ccf3a01d9b30 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 16 Sep 2024 06:24:34 -0700 Subject: [PATCH 98/99] RDMA/nldev: Add missing break in rdma_nl_notify_err_msg() Clang warns (or errors with CONFIG_WERROR=y): drivers/infiniband/core/nldev.c:2795:2: error: unannotated fall-through between switch labels [-Werror,-Wimplicit-fallthrough] 2795 | default: | ^ Clang is a little more pedantic than GCC, which does not warn when falling through to a case that is just break or return. Clang's version is more in line with the kernel's own stance in deprecated.rst, which states that all switch/case blocks must end in either break, fallthrough, continue, goto, or return. Add the missing break to silence the warning. 
Fixes: 9cbed5aab5ae ("RDMA/nldev: Add support for RDMA monitoring") Signed-off-by: Nathan Chancellor Link: https://patch.msgid.link/20240916-rdma-fix-clang-fallthrough-nl_notify_err_msg-v1-1-89de6a7423f1@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 10b1411ac53d..39f89a4b8649 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -2792,6 +2792,7 @@ static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num, dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor netdev detach event: port %d\n", port_num); + break; default: break; } From 70920941923316b760bc7a804eb3d49a126d8712 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 18 Sep 2024 10:16:32 +0800 Subject: [PATCH 99/99] RDMA/bnxt_re: Remove the unused variable en_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variable en_dev is not effectively used, so delete it. drivers/infiniband/hw/bnxt_re/main.c:1980:22: warning: variable ‘en_dev’ set but not used. 
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=10867 Signed-off-by: Jiapeng Chong Link: https://patch.msgid.link/20240918021632.36091-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index adff9e494c9d..777068de4bbc 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1977,7 +1977,6 @@ static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, u8 op_type, static void bnxt_re_remove(struct auxiliary_device *adev) { struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); - struct bnxt_en_dev *en_dev; struct bnxt_re_dev *rdev; mutex_lock(&bnxt_re_mutex); @@ -1985,7 +1984,6 @@ static void bnxt_re_remove(struct auxiliary_device *adev) mutex_unlock(&bnxt_re_mutex); return; } - en_dev = en_info->en_dev; rdev = en_info->rdev; if (rdev)