if (lntmsg[i] == NULL)
continue;
+ /* propagate health status to LNet for requests */
+ if (i == 0 && lntmsg[i])
+ lntmsg[i]->msg_health_status = tx->tx_hstatus;
+
lnet_finalize(lntmsg[i], rc);
}
}
void
-kiblnd_txlist_done(struct list_head *txlist, int status)
+kiblnd_txlist_done(struct list_head *txlist, int status,
+ enum lnet_msg_hstatus hstatus)
{
struct kib_tx *tx;
/* complete now */
tx->tx_waiting = 0;
tx->tx_status = status;
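+ /* don't let a healthy status from the caller overwrite an
+ * error already recorded on the tx */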
+ if (hstatus != LNET_MSG_STATUS_OK)
+ tx->tx_hstatus = hstatus;
kiblnd_tx_done(tx);
}
}
LASSERT (tx->tx_nfrags == 0);
tx->tx_gaps = false;
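+ /* a freshly allocated tx starts out healthy */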
+ tx->tx_hstatus = LNET_MSG_STATUS_OK;
return tx;
}
* own this rx (and rx::rx_conn) anymore, LU-5678.
*/
kiblnd_conn_addref(conn);
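+ /* newer kernels declare the work-request arguments of
+ * ib_post_recv() const, hence the cast */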
+#ifdef HAVE_IB_POST_SEND_RECV_CONST
+ rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq,
+ (const struct ib_recv_wr **)&bad_wrq);
+#else
rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
+#endif
if (unlikely(rc != 0)) {
CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
spin_unlock(&conn->ibc_lock);
CWARN("Unmatched completion type %x cookie %#llx from %s\n",
- txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- kiblnd_close_conn(conn, -EPROTO);
- return;
- }
+ txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kiblnd_close_conn(conn, -EPROTO);
+ return;
+ }
- if (tx->tx_status == 0) { /* success so far */
- if (status < 0) { /* failed? */
- tx->tx_status = status;
- } else if (txtype == IBLND_MSG_GET_REQ) {
- lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
- }
- }
+ if (tx->tx_status == 0) { /* success so far */
+ if (status < 0) { /* failed? */
+ tx->tx_status = status;
+ tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR;
+ } else if (txtype == IBLND_MSG_GET_REQ) {
+ lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
+ }
+ }
- tx->tx_waiting = 0;
+ tx->tx_waiting = 0;
- idle = !tx->tx_queued && (tx->tx_sending == 0);
- if (idle)
+ idle = !tx->tx_queued && (tx->tx_sending == 0);
+ if (idle)
list_del(&tx->tx_list);
spin_unlock(&conn->ibc_lock);
{
struct kib_msg *msg = tx->tx_msg;
struct kib_peer_ni *peer_ni = conn->ibc_peer;
+ struct lnet_ni *ni = peer_ni->ibp_ni;
int ver = conn->ibc_version;
int rc;
int done;
LASSERT(conn->ibc_credits <= conn->ibc_queue_depth);
if (conn->ibc_nsends_posted ==
- conn->ibc_queue_depth) {
+ kiblnd_concurrent_sends(ver, ni)) {
/* tx completions outstanding... */
CDEBUG(D_NET, "%s: posted enough\n",
libcfs_nid2str(peer_ni->ibp_nid));
* kiblnd_check_sends_locked will queue NOOP again when
* posted NOOPs complete */
spin_unlock(&conn->ibc_lock);
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
kiblnd_tx_done(tx);
spin_lock(&conn->ibc_lock);
CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
libcfs_nid2str(conn->ibc_peer->ibp_nid),
conn->ibc_noops_posted);
bad = NULL;
- rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad);
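+ /* fault injection: lnet_send_error_simulation() may ask us to
+ * fail this send, and sets the simulated health status itself */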
+ if (lnet_send_error_simulation(tx->tx_lntmsg[0], &tx->tx_hstatus))
+ rc = -EINVAL;
+ else
+#ifdef HAVE_IB_POST_SEND_RECV_CONST
+ rc = ib_post_send(conn->ibc_cmid->qp, wr,
+ (const struct ib_send_wr **)&bad);
+#else
+ rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad);
+#endif
}
conn->ibc_last_send = ktime_get();
}
LASSERT(conn->ibc_nsends_posted <=
- conn->ibc_queue_depth);
+ kiblnd_concurrent_sends(ver, ni));
LASSERT (!IBLND_OOB_CAPABLE(ver) ||
conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
LASSERT (conn->ibc_reserved_credits >= 0);
conn->ibc_noops_posted--;
if (failed) {
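+ /* the send completed with an error status: count the
+ * message as dropped on the remote side */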
+ tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED;
tx->tx_waiting = 0; /* don't wait for peer_ni */
tx->tx_status = -EIO;
}
LASSERT(!tx->tx_queued); /* not queued for sending already */
LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
- timeout_ns = *kiblnd_tunables.kib_timeout * NSEC_PER_SEC;
+ if (conn->ibc_state >= IBLND_CONN_DISCONNECTED) {
+ tx->tx_status = -ECONNABORTED;
+ tx->tx_waiting = 0;
+ if (tx->tx_conn != NULL) {
+ /* PUT_DONE first attached to conn as a PUT_REQ */
+ LASSERT(tx->tx_conn == conn);
+ LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
+ tx->tx_conn = NULL;
+ kiblnd_conn_decref(conn);
+ }
+ list_add(&tx->tx_list, &conn->ibc_zombie_txs);
+
+ return;
+ }
+
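+ /* the tx deadline now comes from the LNet-level LND timeout
+ * rather than the per-LND kib_timeout tunable */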
+ timeout_ns = lnet_get_lnd_timeout() * NSEC_PER_SEC;
tx->tx_queued = 1;
tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns);
LASSERT (net != NULL);
LASSERT (peer_ni->ibp_connecting > 0);
- cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer_ni, RDMA_PS_TCP,
- IB_QPT_RC);
+ cmid = kiblnd_rdma_create_id(peer_ni->ibp_ni->ni_net_ns,
+ kiblnd_cm_callback, peer_ni,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cmid)) {
CERROR("Can't create CMID for %s: %ld\n",
kiblnd_peer_addref(peer_ni); /* cmid's ref */
- if (*kiblnd_tunables.kib_use_priv_port) {
- rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
- *kiblnd_tunables.kib_timeout * 1000);
- } else {
- rc = rdma_resolve_addr(cmid,
- (struct sockaddr *)&srcaddr,
- (struct sockaddr *)&dstaddr,
- *kiblnd_tunables.kib_timeout * 1000);
- }
- if (rc != 0) {
- /* Can't initiate address resolution: */
- CERROR("Can't resolve addr for %s: %d\n",
- libcfs_nid2str(peer_ni->ibp_nid), rc);
- goto failed2;
- }
+ if (*kiblnd_tunables.kib_use_priv_port) {
+ rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
+ lnet_get_lnd_timeout() * 1000);
+ } else {
+ rc = rdma_resolve_addr(cmid,
+ (struct sockaddr *)&srcaddr,
+ (struct sockaddr *)&dstaddr,
+ lnet_get_lnd_timeout() * 1000);
+ }
+ if (rc != 0) {
+ /* Can't initiate address resolution: */
+ CERROR("Can't resolve addr for %s: %d\n",
+ libcfs_nid2str(peer_ni->ibp_nid), rc);
+ goto failed2;
+ }
return;
CWARN("Abort reconnection of %s: %s\n",
libcfs_nid2str(peer_ni->ibp_nid), reason);
- kiblnd_txlist_done(&txs, -ECONNABORTED);
+ kiblnd_txlist_done(&txs, -ECONNABORTED,
+ LNET_MSG_STATUS_LOCAL_ABORTED);
return false;
}
if (tx != NULL) {
tx->tx_status = -EHOSTUNREACH;
tx->tx_waiting = 0;
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
kiblnd_tx_done(tx);
}
return;
if (rc != 0) {
CERROR("Can't setup GET sink for %s: %d\n",
libcfs_nid2str(target.nid), rc);
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
kiblnd_tx_done(tx);
return -EIO;
}
kiblnd_queue_tx(tx, rx->rx_conn);
return;
- failed_1:
+
+failed_1:
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
kiblnd_tx_done(tx);
- failed_0:
+failed_0:
lnet_finalize(lntmsg, -EIO);
}
if (rc != 0) {
CERROR("Can't setup PUT sink for %s: %d\n",
libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
kiblnd_tx_done(tx);
/* tell peer_ni it's over */
kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK,
static void
kiblnd_peer_notify(struct kib_peer_ni *peer_ni)
{
- int error = 0;
+ int error = 0;
time64_t last_alive = 0;
- unsigned long flags;
+ unsigned long flags;
read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
if (kiblnd_peer_idle(peer_ni) && peer_ni->ibp_error != 0) {
- error = peer_ni->ibp_error;
- peer_ni->ibp_error = 0;
+ error = peer_ni->ibp_error;
+ peer_ni->ibp_error = 0;
- last_alive = peer_ni->ibp_last_alive;
- }
+ last_alive = peer_ni->ibp_last_alive;
+ }
read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- if (error != 0)
- lnet_notify(peer_ni->ibp_ni,
- peer_ni->ibp_nid, 0, last_alive);
+ if (error != 0)
+ lnet_notify(peer_ni->ibp_ni,
+ peer_ni->ibp_nid, false, false, last_alive);
}
void
write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
}
-static void
+void
kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs)
{
- struct list_head zombies = LIST_HEAD_INIT(zombies);
+ LIST_HEAD(zombies);
struct list_head *tmp;
struct list_head *nxt;
struct kib_tx *tx;
LASSERT(!tx->tx_queued);
LASSERT(tx->tx_waiting ||
tx->tx_sending != 0);
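+ /* on a connection timeout, distinguish txs still on the wire
+ * (network timeout) from those waiting on a reply from the
+ * peer (remote timeout) */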
+ if (conn->ibc_comms_error == -ETIMEDOUT) {
+ if (tx->tx_waiting && !tx->tx_sending)
+ tx->tx_hstatus =
+ LNET_MSG_STATUS_REMOTE_TIMEOUT;
+ else if (tx->tx_sending)
+ tx->tx_hstatus =
+ LNET_MSG_STATUS_NETWORK_TIMEOUT;
+ }
} else {
LASSERT(tx->tx_queued);
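+ /* the tx never made it off the queue, so the failure is local */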
+ if (conn->ibc_comms_error == -ETIMEDOUT)
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT;
+ else
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
}
tx->tx_status = -ECONNABORTED;
tx->tx_waiting = 0;
+ /*
+ * TODO: this assumes kiblnd_tx_complete() will be called
+ * for each tx. If that event is dropped we could end up
+ * with stale connections floating around. We'd like to
+ * deal with that in a better way.
+ *
+ * It also means the timeout can be exceeded by many
+ * seconds.
+ */
if (tx->tx_sending == 0) {
tx->tx_queued = 0;
list_del(&tx->tx_list);
spin_unlock(&conn->ibc_lock);
- kiblnd_txlist_done(&zombies, -ECONNABORTED);
+ /*
+ * Aborting transmits happens when the connection is being
+ * finalized, and the connection is only finalized on error.
+ * Passing LNET_MSG_STATUS_OK to txlist_done() leaves any
+ * value already set in tx->tx_hstatus above untouched.
+ */
+ kiblnd_txlist_done(&zombies, -ECONNABORTED, LNET_MSG_STATUS_OK);
}
static void
LASSERT (!in_interrupt());
LASSERT (conn->ibc_state > IBLND_CONN_INIT);
- kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
-
/* abort_receives moves QP state to IB_QPS_ERR. This is only required
* for connections that didn't get as far as being connected, because
* rdma_disconnect() does this for free. */
kiblnd_abort_receives(conn);
+ kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
+
/* Complete all tx descs not waiting for sends to complete.
* NB we should be safe from RDMA now that the QP has changed state */
kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active,
int error)
{
- struct list_head zombies = LIST_HEAD_INIT(zombies);
+ LIST_HEAD(zombies);
unsigned long flags;
LASSERT (error != 0);
CNETERR("Deleting messages for %s: connection failed\n",
libcfs_nid2str(peer_ni->ibp_nid));
- kiblnd_txlist_done(&zombies, -EHOSTUNREACH);
+ kiblnd_txlist_done(&zombies, error,
+ LNET_MSG_STATUS_LOCAL_DROPPED);
}
static void
kiblnd_close_conn_locked(conn, -ECONNABORTED);
write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- kiblnd_txlist_done(&txs, -ECONNABORTED);
+ kiblnd_txlist_done(&txs, -ECONNABORTED,
+ LNET_MSG_STATUS_LOCAL_ERROR);
return;
}
CNETERR("Can't resolve address for %s: %d\n",
libcfs_nid2str(peer_ni->ibp_nid), event->status);
rc = event->status;
- } else {
- rc = rdma_resolve_route(
- cmid, *kiblnd_tunables.kib_timeout * 1000);
+ } else {
+ rc = rdma_resolve_route(
+ cmid, lnet_get_lnd_timeout() * 1000);
if (rc == 0) {
struct kib_net *net = peer_ni->ibp_ni->ni_data;
struct kib_dev *dev = net->ibn_dev;
static void
kiblnd_check_conns (int idx)
{
- struct list_head closes = LIST_HEAD_INIT(closes);
- struct list_head checksends = LIST_HEAD_INIT(checksends);
- struct list_head timedout_txs = LIST_HEAD_INIT(timedout_txs);
+ LIST_HEAD(closes);
+ LIST_HEAD(checksends);
+ LIST_HEAD(timedout_txs);
struct list_head *peers = &kiblnd_data.kib_peers[idx];
struct list_head *ptmp;
struct kib_peer_ni *peer_ni;
write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
if (!list_empty(&timedout_txs))
- kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT);
+ kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT,
+ LNET_MSG_STATUS_LOCAL_TIMEOUT);
/* Handle timeout by closing the whole
* connection. We can only be sure RDMA activity
const int n = 4;
const int p = 1;
int chunk = kiblnd_data.kib_peer_hash_size;
+ unsigned int lnd_timeout;
spin_unlock_irqrestore(lock, flags);
dropped_lock = 1;
* connection within (n+1)/n times the timeout
* interval. */
- if (*kiblnd_tunables.kib_timeout > n * p)
- chunk = (chunk * n * p) /
- *kiblnd_tunables.kib_timeout;
- if (chunk == 0)
- chunk = 1;
+ lnd_timeout = lnet_get_lnd_timeout();
+ if (lnd_timeout > n * p)
+ chunk = (chunk * n * p) / lnd_timeout;
+ if (chunk == 0)
+ chunk = 1;
for (i = 0; i < chunk; i++) {
kiblnd_check_conns(peer_index);
kiblnd_data.kib_peer_hash_size;
}
- deadline += msecs_to_jiffies(p * MSEC_PER_SEC);
+ deadline += cfs_time_seconds(p);
spin_lock_irqsave(lock, flags);
}
{
struct kib_conn *conn = arg;
- switch (event->event) {
- case IB_EVENT_COMM_EST:
- CDEBUG(D_NET, "%s established\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ switch (event->event) {
+ case IB_EVENT_COMM_EST:
+ CDEBUG(D_NET, "%s established\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
/* We received a packet but connection isn't established
* probably handshake packet was lost, so free to
* force make connection established */
rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST);
- return;
+ return;
- default:
- CERROR("%s: Async QP event type %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
- return;
- }
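+ /* a fatal port/device event marks the NI unhealthy so LNet can
+ * avoid it; a later PORT_ACTIVE event clears the flag */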
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_DEVICE_FATAL:
+ CERROR("Fatal device error for NI %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid));
+ atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 1);
+ return;
+
+ case IB_EVENT_PORT_ACTIVE:
+ CERROR("Port reactivated for NI %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid));
+ atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 0);
+ return;
+
+ default:
+ CERROR("%s: Async QP event type %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+ return;
+ }
}
static void
{
rwlock_t *glock = &kiblnd_data.kib_global_lock;
struct kib_dev *dev;
+ struct net *ns = arg;
wait_queue_entry_t wait;
unsigned long flags;
int rc;
dev->ibd_failover = 1;
write_unlock_irqrestore(glock, flags);
- rc = kiblnd_dev_failover(dev);
+ rc = kiblnd_dev_failover(dev, ns);
write_lock_irqsave(glock, flags);