From: Liang Zhen Date: Mon, 17 Nov 2014 06:35:25 +0000 (+0800) Subject: LU-519 o2iblnd: check wr_id returned by ib_poll_cq X-Git-Tag: 2.7.66~53 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=refs%2Fchanges%2F47%2F12747%2F3;p=fs%2Flustre-release.git LU-519 o2iblnd: check wr_id returned by ib_poll_cq If ib_poll_cq returns a positive value without initialising ib_wc::wr_id (a bug in the driver), then o2iblnd will run into an unpredictable situation because ib_wc::wr_id may refer to a stale tx/rx pointer left on the stack. This indicates a bug in the HCA driver; if it happens, ko2iblnd should output a console error and then close the current connection. This patch could also be helpful for LU-5271. Signed-off-by: Liang Zhen Change-Id: I7851e009bb6cd7df3c299b23b6f338b86ba73b68 Reviewed-on: http://review.whamcloud.com/12747 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Isaac Huang Reviewed-by: Doug Oucharek Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 5f5d10d..715f560 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -910,9 +910,10 @@ kiblnd_queue2str(kib_conn_t *conn, struct list_head *q) /* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the * lowest bits of the work request id to stash the work item type. 
*/ -#define IBLND_WID_TX 0 -#define IBLND_WID_RDMA 1 +#define IBLND_WID_INVAL 0 +#define IBLND_WID_TX 1 #define IBLND_WID_RX 2 +#define IBLND_WID_RDMA 3 #define IBLND_WID_MASK 3UL static inline __u64 diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 9567d0e..42b9d7a 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -768,7 +768,6 @@ __must_hold(&conn->ibc_lock) int ver = conn->ibc_version; int rc; int done; - struct ib_send_wr *bad_wrq; LASSERT(tx->tx_queued); /* We rely on this for QP sizing */ @@ -848,9 +847,15 @@ __must_hold(&conn->ibc_lock) /* close_conn will launch failover */ rc = -ENETDOWN; } else { - rc = ib_post_send(conn->ibc_cmid->qp, - tx->tx_wrq, &bad_wrq); - } + struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq - 1]; + + LASSERTF(wrq->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX), + "bad wr_id "LPX64", opc %d, flags %d, peer: %s\n", + wrq->wr_id, wrq->opcode, wrq->send_flags, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + wrq = NULL; + rc = ib_post_send(conn->ibc_cmid->qp, tx->tx_wrq, &wrq); + } conn->ibc_last_send = jiffies; @@ -3398,6 +3403,8 @@ kiblnd_scheduler(void *arg) spin_unlock_irqrestore(&sched->ibs_lock, flags); + wc.wr_id = IBLND_WID_INVAL; + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); if (rc == 0) { rc = ib_req_notify_cq(conn->ibc_cq, @@ -3416,6 +3423,19 @@ kiblnd_scheduler(void *arg) rc = ib_poll_cq(conn->ibc_cq, 1, &wc); } + if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) { + LCONSOLE_ERROR( + "ib_poll_cq (rc: %d) returned invalid " + "wr_id, opcode %d, status: %d, " + "vendor_err: %d, conn: %s status: %d\n" + "please upgrade firmware and OFED or " + "contact vendor.\n", rc, + wc.opcode, wc.status, wc.vendor_err, + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_state); + rc = -EINVAL; + } + if (rc < 0) { CWARN("%s: ib_poll_cq failed: %d, " "closing connection\n",