If ib_poll_cq() returns a positive value without initialising
ib_wc::wr_id (a bug in the driver), o2iblnd can behave unpredictably,
because ib_wc::wr_id may then hold a stale tx/rx pointer left on the stack.
This indicates a bug in the HCA driver; if it happens, ko2iblnd should
print a console error and then close the current connection.
This patch could also be helpful for LU-5271
Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Change-Id: I7851e009bb6cd7df3c299b23b6f338b86ba73b68
Reviewed-on: http://review.whamcloud.com/12747
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: James Simmons <uja.ornl@yahoo.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
* lowest bits of the work request id to stash the work item type. */
/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
* lowest bits of the work request id to stash the work item type. */
-#define IBLND_WID_TX 0
-#define IBLND_WID_RDMA 1
+#define IBLND_WID_INVAL 0
+#define IBLND_WID_TX 1
+#define IBLND_WID_RDMA 3
#define IBLND_WID_MASK 3UL
static inline __u64
#define IBLND_WID_MASK 3UL
static inline __u64
int ver = conn->ibc_version;
int rc;
int done;
int ver = conn->ibc_version;
int rc;
int done;
- struct ib_send_wr *bad_wrq;
LASSERT(tx->tx_queued);
/* We rely on this for QP sizing */
LASSERT(tx->tx_queued);
/* We rely on this for QP sizing */
/* close_conn will launch failover */
rc = -ENETDOWN;
} else {
/* close_conn will launch failover */
rc = -ENETDOWN;
} else {
- rc = ib_post_send(conn->ibc_cmid->qp,
- tx->tx_wrq, &bad_wrq);
- }
+ struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq - 1];
+
+ LASSERTF(wrq->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
+ "bad wr_id "LPX64", opc %d, flags %d, peer: %s\n",
+ wrq->wr_id, wrq->opcode, wrq->send_flags,
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ wrq = NULL;
+ rc = ib_post_send(conn->ibc_cmid->qp, tx->tx_wrq, &wrq);
+ }
conn->ibc_last_send = jiffies;
conn->ibc_last_send = jiffies;
spin_unlock_irqrestore(&sched->ibs_lock, flags);
spin_unlock_irqrestore(&sched->ibs_lock, flags);
+ wc.wr_id = IBLND_WID_INVAL;
+
rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
if (rc == 0) {
rc = ib_req_notify_cq(conn->ibc_cq,
rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
if (rc == 0) {
rc = ib_req_notify_cq(conn->ibc_cq,
rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
}
rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
}
+ if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) {
+ LCONSOLE_ERROR(
+ "ib_poll_cq (rc: %d) returned invalid "
+ "wr_id, opcode %d, status: %d, "
+ "vendor_err: %d, conn: %s status: %d\n"
+ "please upgrade firmware and OFED or "
+ "contact vendor.\n", rc,
+ wc.opcode, wc.status, wc.vendor_err,
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ conn->ibc_state);
+ rc = -EINVAL;
+ }
+
if (rc < 0) {
CWARN("%s: ib_poll_cq failed: %d, "
"closing connection\n",
if (rc < 0) {
CWARN("%s: ib_poll_cq failed: %d, "
"closing connection\n",