Whamcloud - gitweb
LU-16213 kfilnd: Finalize replay TNs with deleted peer 84/48784/4
author Chris Horn <chris.horn@hpe.com>
Mon, 15 Aug 2022 21:06:25 +0000 (15:06 -0600)
committer Oleg Drokin <green@whamcloud.com>
Thu, 19 Jan 2023 15:30:24 +0000 (15:30 +0000)
If there are transactions on the replay queue awaiting a hello
response, and the peer is marked for removal (e.g. because the hello
TN failed) then let's finalize those TNs right away rather than wait
for them to hit the timeout.

HPE-bug-id: LUS-11128
Test-Parameters: trivial
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I6dc77cadaf850ab9ec37bf50241074bc3f5650b5
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48784
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Ian Ziemba <ian.ziemba@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Ron Gredvig <ron.gredvig@hpe.com>
lnet/klnds/kfilnd/kfilnd.h
lnet/klnds/kfilnd/kfilnd_tn.c

index 076085d..c027d7a 100644 (file)
@@ -238,6 +238,11 @@ struct kfilnd_peer {
        time64_t kp_hello_ts;
 };
 
+static inline bool kfilnd_peer_deleted(struct kfilnd_peer *kp)
+{
+       return atomic_read(&kp->kp_remove_peer) > 0;
+}
+
 /* Sets kp_hello_sending
  * Returns true if it was already set
  * Returns false otherwise
index 36bc689..b9f94aa 100644 (file)
@@ -614,7 +614,6 @@ static int kfilnd_tn_state_idle(struct kfilnd_transaction *tn,
        struct kfilnd_msg *msg;
        int rc;
        bool finalize = false;
-       ktime_t remaining_time;
        struct lnet_hdr hdr;
        struct lnet_nid srcnid;
 
@@ -626,23 +625,32 @@ static int kfilnd_tn_state_idle(struct kfilnd_transaction *tn,
         */
        if (kfilnd_peer_is_new_peer(tn->tn_kp) &&
            (event == TN_EVENT_INIT_IMMEDIATE || event == TN_EVENT_INIT_BULK)) {
-               remaining_time = max_t(ktime_t, 0,
-                                      tn->deadline - ktime_get_seconds());
-
-               /* If transaction deadline has not be met, return -EAGAIN. This
-                * will cause this transaction event to be replayed. During this
-                * time, an async message from the peer should occur at which
-                * point the kfilnd version should be negotiated.
-                */
-               if (remaining_time > 0) {
+               if (kfilnd_peer_deleted(tn->tn_kp)) {
+                       /* We'll assign a NETWORK_TIMEOUT message health status
+                        * below because we don't know why this peer was marked
+                        * for removal
+                        */
+                       rc = -ESTALE;
+                       KFILND_TN_DEBUG(tn,
+                                       "Dropping message to stale peer %s\n",
+                                       libcfs_nid2str(tn->tn_kp->kp_nid));
+               } else if (ktime_after(tn->deadline, ktime_get_seconds())) {
+                       /* If transaction deadline has not been met, return
+                        * -EAGAIN. This will cause this transaction event to be
+                        * replayed. During this time, an async message from the
+                        * peer should occur at which point the kfilnd version
+                        * should be negotiated.
+                        */
                        KFILND_TN_DEBUG(tn, "%s hello response pending",
                                        libcfs_nid2str(tn->tn_kp->kp_nid));
                        return -EAGAIN;
+               } else {
+                       rc = -ETIMEDOUT;
                }
 
-               rc = 0;
-               kfilnd_tn_status_update(tn, -ETIMEDOUT,
+               kfilnd_tn_status_update(tn, rc,
                                        LNET_MSG_STATUS_NETWORK_TIMEOUT);
+               rc = 0;
                goto out;
        }