if (lntmsg[i] == NULL)
continue;
+ /* propagate health status to LNet for requests */
+ if (i == 0 && lntmsg[i])
+ lntmsg[i]->msg_health_status = tx->tx_hstatus;
+
lnet_finalize(lntmsg[i], rc);
}
}
void
-kiblnd_txlist_done(struct list_head *txlist, int status)
+kiblnd_txlist_done(struct list_head *txlist, int status,
+ enum lnet_msg_hstatus hstatus)
{
struct kib_tx *tx;
/* complete now */
tx->tx_waiting = 0;
tx->tx_status = status;
+ tx->tx_hstatus = hstatus;
kiblnd_tx_done(tx);
}
}
LASSERT (tx->tx_nfrags == 0);
tx->tx_gaps = false;
+ tx->tx_hstatus = LNET_MSG_STATUS_OK;
return tx;
}
spin_unlock(&conn->ibc_lock);
CWARN("Unmatched completion type %x cookie %#llx from %s\n",
- txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- kiblnd_close_conn(conn, -EPROTO);
- return;
- }
+ txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kiblnd_close_conn(conn, -EPROTO);
+ return;
+ }
- if (tx->tx_status == 0) { /* success so far */
- if (status < 0) { /* failed? */
- tx->tx_status = status;
- } else if (txtype == IBLND_MSG_GET_REQ) {
- lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
- }
- }
+ if (tx->tx_status == 0) { /* success so far */
+ if (status < 0) { /* failed? */
+ tx->tx_status = status;
+ tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR;
+ } else if (txtype == IBLND_MSG_GET_REQ) {
+ lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
+ }
+ }
- tx->tx_waiting = 0;
+ tx->tx_waiting = 0;
- idle = !tx->tx_queued && (tx->tx_sending == 0);
- if (idle)
+ idle = !tx->tx_queued && (tx->tx_sending == 0);
+ if (idle)
list_del(&tx->tx_list);
spin_unlock(&conn->ibc_lock);
* kiblnd_check_sends_locked will queue NOOP again when
* posted NOOPs complete */
spin_unlock(&conn->ibc_lock);
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
kiblnd_tx_done(tx);
spin_lock(&conn->ibc_lock);
CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
conn->ibc_noops_posted--;
if (failed) {
+ tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED;
tx->tx_waiting = 0; /* don't wait for peer_ni */
tx->tx_status = -EIO;
}
CWARN("Abort reconnection of %s: %s\n",
libcfs_nid2str(peer_ni->ibp_nid), reason);
- kiblnd_txlist_done(&txs, -ECONNABORTED);
+ kiblnd_txlist_done(&txs, -ECONNABORTED,
+ LNET_MSG_STATUS_LOCAL_ABORTED);
return false;
}
if (tx != NULL) {
tx->tx_status = -EHOSTUNREACH;
tx->tx_waiting = 0;
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
kiblnd_tx_done(tx);
}
return;
if (rc != 0) {
CERROR("Can't setup GET sink for %s: %d\n",
libcfs_nid2str(target.nid), rc);
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
kiblnd_tx_done(tx);
return -EIO;
}
kiblnd_queue_tx(tx, rx->rx_conn);
return;
- failed_1:
+
+failed_1:
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
kiblnd_tx_done(tx);
- failed_0:
+failed_0:
lnet_finalize(lntmsg, -EIO);
}
if (rc != 0) {
CERROR("Can't setup PUT sink for %s: %d\n",
libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
kiblnd_tx_done(tx);
/* tell peer_ni it's over */
kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK,
LASSERT(!tx->tx_queued);
LASSERT(tx->tx_waiting ||
tx->tx_sending != 0);
+ if (conn->ibc_comms_error == -ETIMEDOUT) {
+ if (tx->tx_waiting && !tx->tx_sending)
+ tx->tx_hstatus =
+ LNET_MSG_STATUS_REMOTE_TIMEOUT;
+ else if (tx->tx_sending)
+ tx->tx_hstatus =
+ LNET_MSG_STATUS_NETWORK_TIMEOUT;
+ }
} else {
LASSERT(tx->tx_queued);
+ if (conn->ibc_comms_error == -ETIMEDOUT)
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT;
+ else
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
}
tx->tx_status = -ECONNABORTED;
tx->tx_waiting = 0;
+ /*
+ * TODO: This assumes that kiblnd_tx_complete() will be
+ * called for each tx. If that event is dropped, we
+ * could end up with stale connections floating
+ * around. We'd like to deal with that in a better
+ * way.
+ *
+ * It also means we can exceed the timeout by many
+ * seconds.
+ */
if (tx->tx_sending == 0) {
tx->tx_queued = 0;
list_del(&tx->tx_list);
spin_unlock(&conn->ibc_lock);
- kiblnd_txlist_done(&zombies, -ECONNABORTED);
+ /*
+ * Aborting transmits occurs when finalizing the connection.
+ * The connection is finalized on error.
+ */
+ kiblnd_txlist_done(&zombies, -ECONNABORTED, -1);
}
static void
CNETERR("Deleting messages for %s: connection failed\n",
libcfs_nid2str(peer_ni->ibp_nid));
- kiblnd_txlist_done(&zombies, -EHOSTUNREACH);
+ kiblnd_txlist_done(&zombies, error,
+ LNET_MSG_STATUS_LOCAL_DROPPED);
}
static void
kiblnd_close_conn_locked(conn, -ECONNABORTED);
write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- kiblnd_txlist_done(&txs, -ECONNABORTED);
+ kiblnd_txlist_done(&txs, -ECONNABORTED,
+ LNET_MSG_STATUS_LOCAL_ERROR);
return;
}
write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
if (!list_empty(&timedout_txs))
- kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT);
+ kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT,
+ LNET_MSG_STATUS_LOCAL_TIMEOUT);
/* Handle timeout by closing the whole
* connection. We can only be sure RDMA activity