From 08bbe9e562c403f247a74e99101d238398df6351 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Mon, 15 Aug 2022 15:06:25 -0600 Subject: [PATCH] LU-16213 kfilnd: Finalize replay TNs with deleted peer If there are transactions on the replay queue awaiting a hello response, and the peer is marked for removal (e.g. because the hello TN failed) then let's finalize those TNs right away rather than wait for them to hit the timeout. HPE-bug-id: LUS-11128 Test-Parameters: trivial Signed-off-by: Chris Horn Change-Id: I6dc77cadaf850ab9ec37bf50241074bc3f5650b5 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48784 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Ian Ziemba Reviewed-by: Oleg Drokin Reviewed-by: Ron Gredvig --- lnet/klnds/kfilnd/kfilnd.h | 5 +++++ lnet/klnds/kfilnd/kfilnd_tn.c | 32 ++++++++++++++++++++------------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/lnet/klnds/kfilnd/kfilnd.h b/lnet/klnds/kfilnd/kfilnd.h index 076085d..c027d7a 100644 --- a/lnet/klnds/kfilnd/kfilnd.h +++ b/lnet/klnds/kfilnd/kfilnd.h @@ -238,6 +238,11 @@ struct kfilnd_peer { time64_t kp_hello_ts; }; +static inline bool kfilnd_peer_deleted(struct kfilnd_peer *kp) +{ + return atomic_read(&kp->kp_remove_peer) > 0; +} + /* Sets kp_hello_sending * Returns true if it was already set * Returns false otherwise diff --git a/lnet/klnds/kfilnd/kfilnd_tn.c b/lnet/klnds/kfilnd/kfilnd_tn.c index 36bc689..b9f94aa 100644 --- a/lnet/klnds/kfilnd/kfilnd_tn.c +++ b/lnet/klnds/kfilnd/kfilnd_tn.c @@ -614,7 +614,6 @@ static int kfilnd_tn_state_idle(struct kfilnd_transaction *tn, struct kfilnd_msg *msg; int rc; bool finalize = false; - ktime_t remaining_time; struct lnet_hdr hdr; struct lnet_nid srcnid; @@ -626,23 +625,32 @@ static int kfilnd_tn_state_idle(struct kfilnd_transaction *tn, */ if (kfilnd_peer_is_new_peer(tn->tn_kp) && (event == TN_EVENT_INIT_IMMEDIATE || event == TN_EVENT_INIT_BULK)) { - remaining_time = max_t(ktime_t, 0, - tn->deadline - ktime_get_seconds()); - - /* If transaction deadline has not be met, return -EAGAIN. This - * will cause this transaction event to be replayed. During this - * time, an async message from the peer should occur at which - * point the kfilnd version should be negotiated. - */ - if (remaining_time > 0) { + if (kfilnd_peer_deleted(tn->tn_kp)) { + /* We'll assign a NETWORK_TIMEOUT message health status + * below because we don't know why this peer was marked + * for removal + */ + rc = -ESTALE; + KFILND_TN_DEBUG(tn, + "Dropping message to stale peer %s\n", + libcfs_nid2str(tn->tn_kp->kp_nid)); + } else if (ktime_after(tn->deadline, ktime_get_seconds())) { + /* If transaction deadline has not been met, return + * -EAGAIN. This will cause this transaction event to be + * replayed. During this time, an async message from the + * peer should occur at which point the kfilnd version + * should be negotiated. + */ KFILND_TN_DEBUG(tn, "%s hello response pending", libcfs_nid2str(tn->tn_kp->kp_nid)); return -EAGAIN; + } else { + rc = -ETIMEDOUT; } - rc = 0; - kfilnd_tn_status_update(tn, -ETIMEDOUT, + kfilnd_tn_status_update(tn, rc, LNET_MSG_STATUS_NETWORK_TIMEOUT); + rc = 0; goto out; } -- 1.8.3.1