if (kfilnd_peer_needs_hello(tn->tn_kp)) {
rc = kfilnd_send_hello_request(dev, cpt, tn->tn_kp);
- if (rc) {
+ if (rc && kfilnd_peer_is_new_peer(tn->tn_kp)) {
+ /* Only fail the send if this is a new peer. Otherwise,
+ * attempt the send using our stale peer information.
+ */
kfilnd_tn_free(tn);
return rc;
}
struct kfilnd_immediate_buffer end_immed_bufs[];
};
+/* Newly allocated peer */
+#define KP_STATE_NEW 0x1
+/* Peer after successful hello handshake */
+#define KP_STATE_UPTODATE 0x2
+/* Peer experienced some sort of network failure */
+#define KP_STATE_STALE 0x3
+
struct kfilnd_peer {
struct rhash_head kp_node;
struct rcu_head kp_rcu_head;
u32 kp_remote_session_key;
atomic_t kp_hello_pending;
time64_t kp_hello_ts;
+ atomic_t kp_state;
};
static inline bool kfilnd_peer_deleted(struct kfilnd_peer *kp)
static inline bool kfilnd_peer_is_new_peer(struct kfilnd_peer *kp)
{
- return kp->kp_version == 0;
+ return atomic_read(&kp->kp_state) == KP_STATE_NEW;
}
/* Peer needs hello if it is not up to date and there is not already a hello
static inline bool kfilnd_peer_needs_hello(struct kfilnd_peer *kp)
{
if (atomic_read(&kp->kp_hello_pending) == 0) {
- if (kfilnd_peer_is_new_peer(kp))
+ if (atomic_read(&kp->kp_state) != KP_STATE_UPTODATE)
return true;
} else if (ktime_before(kp->kp_hello_ts + lnet_get_lnd_timeout(),
ktime_get_seconds())) {
}
/**
- * kfilnd_peer_down() - Mark a peer as down.
- * @kp: Peer to be downed.
+ * kfilnd_peer_stale() - Mark a peer as stale.
+ * @kp: Peer to be marked stale.
+ * Note: only "up-to-date" peers can be marked stale. If we haven't heard
+ * from this peer within 5 LND timeouts, then delete this peer.
*/
-void kfilnd_peer_down(struct kfilnd_peer *kp)
+void kfilnd_peer_stale(struct kfilnd_peer *kp)
+{
+ if (atomic_cmpxchg(&kp->kp_state,
+ KP_STATE_UPTODATE,
+ KP_STATE_STALE) == KP_STATE_UPTODATE) {
+ CDEBUG(D_NET, "%s(%p):0x%llx is stale\n",
+ libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr);
+ } else if (ktime_before(kp->kp_last_alive + lnet_get_lnd_timeout() * 5,
+ ktime_get_seconds())) {
+ CDEBUG(D_NET,
+ "Haven't heard from %s(%p):0x%llx in %lld seconds\n",
+ libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr,
+ ktime_sub(ktime_get_seconds(), kp->kp_last_alive));
+ kfilnd_peer_del(kp);
+ }
+}
+
+/**
+ * kfilnd_peer_del() - Mark a peer for deletion.
+ * @kp: Peer to be deleted.
+ */
+void kfilnd_peer_del(struct kfilnd_peer *kp)
{
if (atomic_cmpxchg(&kp->kp_remove_peer, 0, 1) == 0) {
struct lnet_nid peer_nid;
atomic_set(&kp->kp_rx_base, 0);
atomic_set(&kp->kp_remove_peer, 0);
atomic_set(&kp->kp_hello_pending, 0);
+ atomic_set(&kp->kp_state, KP_STATE_NEW);
kp->kp_local_session_key = kfilnd_dev_get_session_key(dev);
kp->kp_hello_ts = ktime_get_seconds();
msg->proto.hello.version);
}
+ atomic_set(&kp->kp_state, KP_STATE_UPTODATE);
+ CDEBUG(D_NET, "kp %s(%p):0x%llx is up-to-date\n",
+ libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr);
+
/* Clear kp_hello_pending if we've received the hello response,
* otherwise this is an incoming hello request and we may have our
* own hello request to this peer still outstanding
#include "kfilnd.h"
-void kfilnd_peer_down(struct kfilnd_peer *kp);
+void kfilnd_peer_stale(struct kfilnd_peer *kp);
+void kfilnd_peer_del(struct kfilnd_peer *kp);
void kfilnd_peer_put(struct kfilnd_peer *kp);
struct kfilnd_peer *kfilnd_peer_get(struct kfilnd_dev *dev, lnet_nid_t nid);
void kfilnd_peer_alive(struct kfilnd_peer *kp);
case TN_EVENT_RX_HELLO:
msg = tn->tn_rx_msg.msg;
+ kfilnd_peer_alive(tn->tn_kp);
+
switch (msg->type) {
case KFILND_MSG_HELLO_REQ:
kfilnd_peer_process_hello(tn->tn_kp, msg);
hstatus = LNET_MSG_STATUS_REMOTE_ERROR;
kfilnd_tn_status_update(tn, status, hstatus);
- kfilnd_peer_down(tn->tn_kp);
+ kfilnd_peer_stale(tn->tn_kp);
if (tn->msg_type == KFILND_MSG_HELLO_REQ)
kfilnd_peer_clear_hello_pending(tn->tn_kp);
break;
hstatus = LNET_MSG_STATUS_REMOTE_ERROR;
kfilnd_tn_status_update(tn, status, hstatus);
- kfilnd_peer_down(tn->tn_kp);
+ kfilnd_peer_stale(tn->tn_kp);
/* Need to cancel the tagged receive to prevent resources from
* being leaked.
hstatus = LNET_MSG_STATUS_REMOTE_ERROR;
kfilnd_tn_status_update(tn, status, hstatus);
- kfilnd_peer_down(tn->tn_kp);
+ kfilnd_peer_stale(tn->tn_kp);
break;
default:
hstatus = LNET_MSG_STATUS_REMOTE_ERROR;
kfilnd_tn_status_update(tn, status, hstatus);
- kfilnd_peer_down(tn->tn_kp);
+ kfilnd_peer_stale(tn->tn_kp);
break;
case TN_EVENT_TAG_TX_OK:
switch (event) {
case TN_EVENT_TX_FAIL:
- kfilnd_peer_down(tn->tn_kp);
+ kfilnd_peer_stale(tn->tn_kp);
break;
case TN_EVENT_TX_OK:
case TN_EVENT_TAG_RX_CANCEL:
kfilnd_tn_status_update(tn, -ETIMEDOUT,
LNET_MSG_STATUS_REMOTE_TIMEOUT);
- kfilnd_peer_down(tn->tn_kp);
+ kfilnd_peer_stale(tn->tn_kp);
break;
case TN_EVENT_TAG_RX_FAIL: