2
0
mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2025-09-04 20:19:47 +08:00

xprtrdma: Split the completion queue

The current CQ handler uses the ib_wc.opcode field to distinguish
between event types. However, the contents of that field are not
reliable if the completion status is not IB_WC_SUCCESS.

When an error completion occurs on a send event, the CQ handler
schedules a tasklet with something that is not a struct rpcrdma_rep.
This is never correct behavior, and sometimes it results in a panic.

To resolve this issue, split the completion queue into a send CQ and
a receive CQ. The send CQ handler now handles only struct rpcrdma_mw
wr_id's, and the receive CQ handler now handles only struct
rpcrdma_rep wr_id's.

Fix suggested by Shirley Ma <shirley.ma@oracle.com>

Reported-by: Rafael Reiter <rafael.reiter@ims.co.at>
Fixes: 5c635e09ce
BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=73211
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Klemens Senn <klemens.senn@ims.co.at>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
This commit is contained in:
Chuck Lever 2014-05-28 10:33:25 -04:00 committed by Anna Schumaker
parent 7f1d54191e
commit fc66448549
2 changed files with 144 additions and 99 deletions

View File

@ -142,96 +142,115 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
} }
} }
static inline static void
void rpcrdma_event_process(struct ib_wc *wc) rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{ {
struct rpcrdma_mw *frmr; struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
struct rpcrdma_rep *rep =
(struct rpcrdma_rep *)(unsigned long) wc->wr_id;
dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", dprintk("RPC: %s: frmr %p status %X opcode %d\n",
__func__, rep, wc->status, wc->opcode, wc->byte_len); __func__, frmr, wc->status, wc->opcode);
if (!rep) /* send completion that we don't care about */ if (wc->wr_id == 0ULL)
return;
if (wc->status != IB_WC_SUCCESS)
return; return;
if (IB_WC_SUCCESS != wc->status) { if (wc->opcode == IB_WC_FAST_REG_MR)
dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
__func__, wc->opcode, wc->status);
rep->rr_len = ~0U;
if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
rpcrdma_schedule_tasklet(rep);
return;
}
switch (wc->opcode) {
case IB_WC_FAST_REG_MR:
frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
frmr->r.frmr.state = FRMR_IS_VALID; frmr->r.frmr.state = FRMR_IS_VALID;
break; else if (wc->opcode == IB_WC_LOCAL_INV)
case IB_WC_LOCAL_INV:
frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
frmr->r.frmr.state = FRMR_IS_INVALID; frmr->r.frmr.state = FRMR_IS_INVALID;
break;
case IB_WC_RECV:
rep->rr_len = wc->byte_len;
ib_dma_sync_single_for_cpu(
rdmab_to_ia(rep->rr_buffer)->ri_id->device,
rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
/* Keep (only) the most recent credits, after check validity */
if (rep->rr_len >= 16) {
struct rpcrdma_msg *p =
(struct rpcrdma_msg *) rep->rr_base;
unsigned int credits = ntohl(p->rm_credit);
if (credits == 0) {
dprintk("RPC: %s: server"
" dropped credits to 0!\n", __func__);
/* don't deadlock */
credits = 1;
} else if (credits > rep->rr_buffer->rb_max_requests) {
dprintk("RPC: %s: server"
" over-crediting: %d (%d)\n",
__func__, credits,
rep->rr_buffer->rb_max_requests);
credits = rep->rr_buffer->rb_max_requests;
}
atomic_set(&rep->rr_buffer->rb_credits, credits);
}
rpcrdma_schedule_tasklet(rep);
break;
default:
dprintk("RPC: %s: unexpected WC event %X\n",
__func__, wc->opcode);
break;
}
} }
static inline int static int
rpcrdma_cq_poll(struct ib_cq *cq) rpcrdma_sendcq_poll(struct ib_cq *cq)
{ {
struct ib_wc wc; struct ib_wc wc;
int rc; int rc;
for (;;) { while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
rc = ib_poll_cq(cq, 1, &wc); rpcrdma_sendcq_process_wc(&wc);
if (rc < 0) { return rc;
dprintk("RPC: %s: ib_poll_cq failed %i\n",
__func__, rc);
return rc;
}
if (rc == 0)
break;
rpcrdma_event_process(&wc);
}
return 0;
} }
/* /*
* rpcrdma_cq_event_upcall * Handle send, fast_reg_mr, and local_inv completions.
*
* Send events are typically suppressed and thus do not result
* in an upcall. Occasionally one is signaled, however. This
* prevents the provider's completion queue from wrapping and
* losing a completion.
*/
static void
rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
{
int rc;
rc = rpcrdma_sendcq_poll(cq);
if (rc) {
dprintk("RPC: %s: ib_poll_cq failed: %i\n",
__func__, rc);
return;
}
rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
if (rc) {
dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc);
return;
}
rpcrdma_sendcq_poll(cq);
}
static void
rpcrdma_recvcq_process_wc(struct ib_wc *wc)
{
struct rpcrdma_rep *rep =
(struct rpcrdma_rep *)(unsigned long)wc->wr_id;
dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
__func__, rep, wc->status, wc->opcode, wc->byte_len);
if (wc->status != IB_WC_SUCCESS) {
rep->rr_len = ~0U;
goto out_schedule;
}
if (wc->opcode != IB_WC_RECV)
return;
rep->rr_len = wc->byte_len;
ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
if (rep->rr_len >= 16) {
struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
unsigned int credits = ntohl(p->rm_credit);
if (credits == 0)
credits = 1; /* don't deadlock */
else if (credits > rep->rr_buffer->rb_max_requests)
credits = rep->rr_buffer->rb_max_requests;
atomic_set(&rep->rr_buffer->rb_credits, credits);
}
out_schedule:
rpcrdma_schedule_tasklet(rep);
}
static int
rpcrdma_recvcq_poll(struct ib_cq *cq)
{
struct ib_wc wc;
int rc;
while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
rpcrdma_recvcq_process_wc(&wc);
return rc;
}
/*
* Handle receive completions.
* *
* This upcall handles recv and send events.
* It is reentrant but processes single events in order to maintain * It is reentrant but processes single events in order to maintain
* ordering of receives to keep server credits. * ordering of receives to keep server credits.
* *
@ -240,26 +259,27 @@ rpcrdma_cq_poll(struct ib_cq *cq)
* connection shutdown. That is, the structures required for * connection shutdown. That is, the structures required for
* the completion of the reply handler must remain intact until * the completion of the reply handler must remain intact until
* all memory has been reclaimed. * all memory has been reclaimed.
*
* Note that send events are suppressed and do not result in an upcall.
*/ */
static void static void
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context) rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{ {
int rc; int rc;
rc = rpcrdma_cq_poll(cq); rc = rpcrdma_recvcq_poll(cq);
if (rc)
return;
rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
if (rc) { if (rc) {
dprintk("RPC: %s: ib_req_notify_cq failed %i\n", dprintk("RPC: %s: ib_poll_cq failed: %i\n",
__func__, rc); __func__, rc);
return; return;
} }
rpcrdma_cq_poll(cq); rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
if (rc) {
dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc);
return;
}
rpcrdma_recvcq_poll(cq);
} }
#ifdef RPC_DEBUG #ifdef RPC_DEBUG
@ -610,6 +630,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
struct rpcrdma_create_data_internal *cdata) struct rpcrdma_create_data_internal *cdata)
{ {
struct ib_device_attr devattr; struct ib_device_attr devattr;
struct ib_cq *sendcq, *recvcq;
int rc, err; int rc, err;
rc = ib_query_device(ia->ri_id->device, &devattr); rc = ib_query_device(ia->ri_id->device, &devattr);
@ -685,7 +706,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.cap.max_recv_sge); ep->rep_attr.cap.max_recv_sge);
/* set trigger for requesting send completion */ /* set trigger for requesting send completion */
ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
if (ep->rep_cqinit <= 2) if (ep->rep_cqinit <= 2)
ep->rep_cqinit = 0; ep->rep_cqinit = 0;
INIT_CQCOUNT(ep); INIT_CQCOUNT(ep);
@ -693,26 +714,43 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
init_waitqueue_head(&ep->rep_connect_wait); init_waitqueue_head(&ep->rep_connect_wait);
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
rpcrdma_cq_async_error_upcall, NULL, rpcrdma_cq_async_error_upcall, NULL,
ep->rep_attr.cap.max_recv_wr +
ep->rep_attr.cap.max_send_wr + 1, 0); ep->rep_attr.cap.max_send_wr + 1, 0);
if (IS_ERR(ep->rep_cq)) { if (IS_ERR(sendcq)) {
rc = PTR_ERR(ep->rep_cq); rc = PTR_ERR(sendcq);
dprintk("RPC: %s: ib_create_cq failed: %i\n", dprintk("RPC: %s: failed to create send CQ: %i\n",
__func__, rc); __func__, rc);
goto out1; goto out1;
} }
rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
if (rc) { if (rc) {
dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc); __func__, rc);
goto out2; goto out2;
} }
ep->rep_attr.send_cq = ep->rep_cq; recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
ep->rep_attr.recv_cq = ep->rep_cq; rpcrdma_cq_async_error_upcall, NULL,
ep->rep_attr.cap.max_recv_wr + 1, 0);
if (IS_ERR(recvcq)) {
rc = PTR_ERR(recvcq);
dprintk("RPC: %s: failed to create recv CQ: %i\n",
__func__, rc);
goto out2;
}
rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
if (rc) {
dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc);
ib_destroy_cq(recvcq);
goto out2;
}
ep->rep_attr.send_cq = sendcq;
ep->rep_attr.recv_cq = recvcq;
/* Initialize cma parameters */ /* Initialize cma parameters */
@ -734,7 +772,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
return 0; return 0;
out2: out2:
err = ib_destroy_cq(ep->rep_cq); err = ib_destroy_cq(sendcq);
if (err) if (err)
dprintk("RPC: %s: ib_destroy_cq returned %i\n", dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, err); __func__, err);
@ -774,8 +812,14 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ep->rep_pad_mr = NULL; ep->rep_pad_mr = NULL;
} }
rpcrdma_clean_cq(ep->rep_cq); rpcrdma_clean_cq(ep->rep_attr.recv_cq);
rc = ib_destroy_cq(ep->rep_cq); rc = ib_destroy_cq(ep->rep_attr.recv_cq);
if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc);
rpcrdma_clean_cq(ep->rep_attr.send_cq);
rc = ib_destroy_cq(ep->rep_attr.send_cq);
if (rc) if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n", dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc); __func__, rc);
@ -798,7 +842,9 @@ retry:
if (rc && rc != -ENOTCONN) if (rc && rc != -ENOTCONN)
dprintk("RPC: %s: rpcrdma_ep_disconnect" dprintk("RPC: %s: rpcrdma_ep_disconnect"
" status %i\n", __func__, rc); " status %i\n", __func__, rc);
rpcrdma_clean_cq(ep->rep_cq);
rpcrdma_clean_cq(ep->rep_attr.recv_cq);
rpcrdma_clean_cq(ep->rep_attr.send_cq);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
id = rpcrdma_create_id(xprt, ia, id = rpcrdma_create_id(xprt, ia,
@ -907,7 +953,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{ {
int rc; int rc;
rpcrdma_clean_cq(ep->rep_cq); rpcrdma_clean_cq(ep->rep_attr.recv_cq);
rpcrdma_clean_cq(ep->rep_attr.send_cq);
rc = rdma_disconnect(ia->ri_id); rc = rdma_disconnect(ia->ri_id);
if (!rc) { if (!rc) {
/* returns without wait if not connected */ /* returns without wait if not connected */
@ -1727,7 +1774,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
ib_dma_sync_single_for_cpu(ia->ri_id->device, ib_dma_sync_single_for_cpu(ia->ri_id->device,
rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
DECR_CQCOUNT(ep);
rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
if (rc) if (rc)

View File

@ -79,7 +79,6 @@ struct rpcrdma_ep {
int rep_cqinit; int rep_cqinit;
int rep_connected; int rep_connected;
struct rpcrdma_ia *rep_ia; struct rpcrdma_ia *rep_ia;
struct ib_cq *rep_cq;
struct ib_qp_init_attr rep_attr; struct ib_qp_init_attr rep_attr;
wait_queue_head_t rep_connect_wait; wait_queue_head_t rep_connect_wait;
struct ib_sge rep_pad; /* holds zeroed pad */ struct ib_sge rep_pad; /* holds zeroed pad */