Whamcloud - gitweb
LU-519 o2iblnd: check wr_id returned by ib_poll_cq 47/12747/3
author Liang Zhen <liang.zhen@intel.com>
Mon, 17 Nov 2014 06:35:25 +0000 (14:35 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Tue, 12 Jan 2016 02:45:02 +0000 (02:45 +0000)
If ib_poll_cq returns a positive value without initialising ib_wc::wr_id
(a bug in the driver), o2iblnd will run into an unpredictable situation
because ib_wc::wr_id may refer to a stale tx/rx pointer on the stack.

If this happens, it indicates a bug in the HCA driver; ko2iblnd should
output a console error and then close the current connection.

This patch could also be helpful for LU-5271.

Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Change-Id: I7851e009bb6cd7df3c299b23b6f338b86ba73b68
Reviewed-on: http://review.whamcloud.com/12747
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: James Simmons <uja.ornl@yahoo.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c

index 5f5d10d..715f560 100644 (file)
@@ -910,9 +910,10 @@ kiblnd_queue2str(kib_conn_t *conn, struct list_head *q)
 /* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
  * lowest bits of the work request id to stash the work item type. */
 
 /* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
  * lowest bits of the work request id to stash the work item type. */
 
-#define IBLND_WID_TX    0
-#define IBLND_WID_RDMA  1
+#define IBLND_WID_INVAL 0
+#define IBLND_WID_TX    1
 #define IBLND_WID_RX    2
 #define IBLND_WID_RX    2
+#define IBLND_WID_RDMA  3
 #define IBLND_WID_MASK  3UL
 
 static inline __u64
 #define IBLND_WID_MASK  3UL
 
 static inline __u64
index 9567d0e..42b9d7a 100644 (file)
@@ -768,7 +768,6 @@ __must_hold(&conn->ibc_lock)
         int                ver = conn->ibc_version;
         int                rc;
         int                done;
         int                ver = conn->ibc_version;
         int                rc;
         int                done;
-        struct ib_send_wr *bad_wrq;
 
        LASSERT(tx->tx_queued);
        /* We rely on this for QP sizing */
 
        LASSERT(tx->tx_queued);
        /* We rely on this for QP sizing */
@@ -848,9 +847,15 @@ __must_hold(&conn->ibc_lock)
                 /* close_conn will launch failover */
                 rc = -ENETDOWN;
         } else {
                 /* close_conn will launch failover */
                 rc = -ENETDOWN;
         } else {
-                rc = ib_post_send(conn->ibc_cmid->qp,
-                                  tx->tx_wrq, &bad_wrq);
-        }
+               struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq - 1];
+
+               LASSERTF(wrq->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
+                        "bad wr_id "LPX64", opc %d, flags %d, peer: %s\n",
+                        wrq->wr_id, wrq->opcode, wrq->send_flags,
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               wrq = NULL;
+               rc = ib_post_send(conn->ibc_cmid->qp, tx->tx_wrq, &wrq);
+       }
 
         conn->ibc_last_send = jiffies;
 
 
         conn->ibc_last_send = jiffies;
 
@@ -3398,6 +3403,8 @@ kiblnd_scheduler(void *arg)
 
                        spin_unlock_irqrestore(&sched->ibs_lock, flags);
 
 
                        spin_unlock_irqrestore(&sched->ibs_lock, flags);
 
+                       wc.wr_id = IBLND_WID_INVAL;
+
                         rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
                         if (rc == 0) {
                                 rc = ib_req_notify_cq(conn->ibc_cq,
                         rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
                         if (rc == 0) {
                                 rc = ib_req_notify_cq(conn->ibc_cq,
@@ -3416,6 +3423,19 @@ kiblnd_scheduler(void *arg)
                                rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
                        }
 
                                rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
                        }
 
+                       if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) {
+                               LCONSOLE_ERROR(
+                                       "ib_poll_cq (rc: %d) returned invalid "
+                                       "wr_id, opcode %d, status: %d, "
+                                       "vendor_err: %d, conn: %s status: %d\n"
+                                       "please upgrade firmware and OFED or "
+                                       "contact vendor.\n", rc,
+                                       wc.opcode, wc.status, wc.vendor_err,
+                                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                                       conn->ibc_state);
+                               rc = -EINVAL;
+                       }
+
                        if (rc < 0) {
                                CWARN("%s: ib_poll_cq failed: %d, "
                                      "closing connection\n",
                        if (rc < 0) {
                                CWARN("%s: ib_poll_cq failed: %d, "
                                      "closing connection\n",