LU-17855 lnet: Set peer NI down on lnet_notify

author Chris Horn <chris.horn@hpe.com>

Fri, 15 Mar 2024 18:52:07 +0000 (12:52 -0600)

committer Oleg Drokin <green@whamcloud.com>

Tue, 25 Jun 2024 03:30:58 +0000 (03:30 +0000)
author Chris Horn <chris.horn@hpe.com>
Fri, 15 Mar 2024 18:52:07 +0000 (12:52 -0600)
committer Oleg Drokin <green@whamcloud.com>
Tue, 25 Jun 2024 03:30:58 +0000 (03:30 +0000)
diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h

index 72820ee..74bf050 100644 (file)
--- a/lnet/include/lnet/lib-types.h
+++ b/lnet/include/lnet/lib-types.h
@@ -1395,6 +1395,14 @@ struct lnet_peer_ni {
         __u32                   lpni_sel_priority;
         /* number of preferred NIDs in lnpi_pref_nids */
         __u32                   lpni_pref_nnids;
+       /* Whether some thread is processing an lnet_notify() event for this
+        * peer NI
+        */
+       bool                    lpni_notifying;
+       /* Timestamp of the last lnet_notify() event for this peer NI */
+       time64_t                lpni_timestamp;
+       /* Whether we've received an lnet_notify() event for this peer NI */
+       bool                    lpni_notified;
  };
  
  /* Preferred path added due to traffic on non-MR peer_ni */
diff --git a/lnet/klnds/kfilnd/kfilnd_peer.c b/lnet/klnds/kfilnd/kfilnd_peer.c

index 8ad2dce..a7eed14 100644 (file)
--- a/lnet/klnds/kfilnd/kfilnd_peer.c
+++ b/lnet/klnds/kfilnd/kfilnd_peer.c
@@ -377,6 +377,8 @@ void kfilnd_peer_process_hello(struct kfilnd_peer *kp, struct kfilnd_msg *msg)
                         CDEBUG(D_NET, "Peer %s(%p):0x%llx new -> wait response\n",
                                libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr);
         } else if (msg->type == KFILND_MSG_HELLO_RSP) {
+               struct lnet_nid nid;
+
                 kp->kp_version = msg->proto.hello.version;
                 atomic_set(&kp->kp_state, KP_STATE_UPTODATE);
                 CDEBUG(D_NET,
@@ -384,5 +386,9 @@ void kfilnd_peer_process_hello(struct kfilnd_peer *kp, struct kfilnd_msg *msg)
                        libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr,
                        msg->proto.hello.version);
                 kfilnd_peer_clear_hello_state(kp);
+
+               lnet_nid4_to_nid(kp->kp_nid, &nid);
+               lnet_notify(kp->kp_dev->kfd_ni, &nid, true, false,
+                           kp->kp_last_alive);
         }
  }
diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c

index 64e0c92..40e9adb 100644 (file)
--- a/lnet/lnet/lib-move.c
+++ b/lnet/lnet/lib-move.c
@@ -768,36 +768,14 @@ lnet_check_message_drop(struct lnet_ni *ni, struct lnet_peer_ni *lpni,
         if (!lnet_peer_aliveness_enabled(lpni))
                 return 0;
  
-       /* If we're resending a message, let's attempt to send it even if
-        * the peer is down to fulfill our resend quota on the message
-        */
-       if (msg->msg_retry_count > 0)
-               return 0;
-
-       /* try and send recovery messages regardless */
-       if (msg->msg_recovery)
-               return 0;
-
-       /* always send any responses */
-       if (lnet_msg_is_response(msg))
-               return 0;
-
         /* always send non-routed messages */
         if (!msg->msg_routing)
                 return 0;
  
-       /* assume peer_ni is alive as long as we're within the configured
-        * peer timeout
-        */
-       if (ktime_get_seconds() <
-           (lpni->lpni_last_alive +
-            lpni->lpni_net->net_tunables.lct_peer_timeout))
+       if (lnet_is_peer_ni_alive(lpni))
                 return 0;
  
-       if (!lnet_is_peer_ni_alive(lpni))
-               return -EHOSTUNREACH;
-
-       return 0;
+       return -EHOSTUNREACH;
  }
  
  /**
@@ -4996,7 +4974,11 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr,
          * from it. The ping response reports back the ns_status which is
          * marked on the remote as up or down and we cache it here.
          */
-       msg->msg_rxpeer->lpni_ns_status = LNET_NI_STATUS_UP;
+       if (unlikely(msg->msg_rxpeer->lpni_ns_status != LNET_NI_STATUS_UP)) {
+               spin_lock(&msg->msg_rxpeer->lpni_lock);
+               msg->msg_rxpeer->lpni_ns_status = LNET_NI_STATUS_UP;
+               spin_unlock(&msg->msg_rxpeer->lpni_lock);
+       }
  
         lnet_msg_commit(msg, cpt);
  
diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c

index abe4512..76db4a3 100644 (file)
--- a/lnet/lnet/lib-msg.c
+++ b/lnet/lnet/lib-msg.c
@@ -826,15 +826,16 @@ lnet_health_check(struct lnet_msg *msg)
                 lnet_incr_hstats(ni, lpni, hstatus);
                 /* For remote failures, health/recovery/resends are not needed
                  * if the peer only has a single interface. Special case for
-                * routers where we rely on health feature to manage route
-                * aliveness. NB: lp_nnis does _not_ include the lolnd, so a
-                * single-rail node would have lp_nnis == 1.
+                * routers where we rely on health feature to manage route and
+                * peer aliveness. NB: unlike pb_nnis above, lp_nnis does _not_
+                * include the lolnd, so a single-rail node would have
+                * lp_nnis == 1.
                  */
                 if (lpni && lpni->lpni_peer_net &&
                     lpni->lpni_peer_net->lpn_peer &&
                     lpni->lpni_peer_net->lpn_peer->lp_nnis <= 1) {
                         attempt_remote_resend = false;
-                       if (!lnet_isrouter(lpni))
+                       if (!(lnet_isrouter(lpni) || the_lnet.ln_routing))
                                 handle_remote_health = false;
                 }
                 /* Do not put my interfaces into peer NI recovery. They should
diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c

index 9c3d4e1..398fb76 100644 (file)
--- a/lnet/lnet/peer.c
+++ b/lnet/lnet/peer.c
@@ -160,6 +160,7 @@ lnet_peer_ni_alloc(struct lnet_nid *nid)
         lpni->lpni_nid = *nid;
         lpni->lpni_cpt = cpt;
         atomic_set(&lpni->lpni_healthv, LNET_MAX_HEALTH_VALUE);
+       lpni->lpni_notified = false;
  
         net = lnet_get_net_locked(LNET_NID_NET(nid));
         lpni->lpni_net = net;
@@ -3189,15 +3190,25 @@ int ping_info_count_entries(struct lnet_ping_buffer *pbuf)
         return nnis;
  }
  
-static inline void handle_disc_lpni_health(struct lnet_peer_ni *lpni)
+static inline void handle_disc_lpni_health(struct lnet_peer_ni *lpni,
+                                          __u32 new_status)
  {
-       if (lpni->lpni_ns_status == LNET_NI_STATUS_DOWN) {
+       __u32 old_status;
+
+       spin_lock(&lpni->lpni_lock);
+       old_status = lpni->lpni_ns_status;
+       lpni->lpni_ns_status = new_status;
+       spin_unlock(&lpni->lpni_lock);
+
+       /* Decrement health when transitioning from UP to DOWN */
+       if (old_status != new_status && new_status == LNET_NI_STATUS_DOWN) {
                 lnet_net_lock(0);
                 lnet_handle_remote_failure_locked(lpni);
                 lnet_net_unlock(0);
-       } else if (lpni->lpni_ns_status == LNET_NI_STATUS_UP &&
-                !lpni->lpni_last_alive)
+       } else if (new_status == LNET_NI_STATUS_UP && !lpni->lpni_last_alive) {
+               /* Set health to max if the initial status is UP */
                 atomic_set(&lpni->lpni_healthv, LNET_MAX_HEALTH_VALUE);
+       }
  }
  
  /*
@@ -3239,7 +3250,6 @@ static int lnet_peer_merge_data(struct lnet_peer *lp,
         int i;
         int j;
         int rc;
-       __u32 old_st;
  
         flags = LNET_PEER_DISCOVERED;
         if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL)
@@ -3318,10 +3328,7 @@ static int lnet_peer_merge_data(struct lnet_peer *lp,
                                  */
                                 lpni = lnet_peer_ni_find_locked(&curnis[i]);
                                 if (lpni) {
-                                       old_st = lpni->lpni_ns_status;
-                                       lpni->lpni_ns_status = *stp;
-                                       if (old_st != lpni->lpni_ns_status)
-                                               handle_disc_lpni_health(lpni);
+                                       handle_disc_lpni_health(lpni, *stp);
                                         lnet_peer_ni_decref_locked(lpni);
                                 }
                                 break;
@@ -3351,8 +3358,7 @@ static int lnet_peer_merge_data(struct lnet_peer *lp,
                 }
                 lpni = lnet_peer_ni_find_locked(&addnis[i].ns_nid);
                 if (lpni) {
-                       lpni->lpni_ns_status = addnis[i].ns_status;
-                       handle_disc_lpni_health(lpni);
+                       handle_disc_lpni_health(lpni, addnis[i].ns_status);
                         lnet_peer_ni_decref_locked(lpni);
                 }
         }
diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c

index 6bc84b6..9b43823 100644 (file)
--- a/lnet/lnet/router.c
+++ b/lnet/lnet/router.c
@@ -1644,13 +1644,6 @@ lnet_rtrpools_disable(void)
         lnet_rtrpools_free(1);
  }
  
-static inline void
-lnet_notify_peer_down(struct lnet_ni *ni, struct lnet_nid *nid)
-{
-       if (ni->ni_net->net_lnd->lnd_notify_peer_down != NULL)
-               (ni->ni_net->net_lnd->lnd_notify_peer_down)(nid);
-}
-
  /*
   * ni: local NI used to communicate with the peer
   * nid: peer NID
@@ -1666,7 +1659,9 @@ lnet_notify(struct lnet_ni *ni, struct lnet_nid *nid, bool alive, bool reset,
         struct lnet_route *route;
         struct lnet_peer *lp;
         time64_t now = ktime_get_seconds();
-       int cpt;
+       int cpt = lnet_nid2cpt(nid, ni);
+       unsigned int ns_status = alive ? LNET_NI_STATUS_UP :
+                                        LNET_NI_STATUS_DOWN;
  
         LASSERT(!in_interrupt());
  
@@ -1690,45 +1685,49 @@ lnet_notify(struct lnet_ni *ni, struct lnet_nid *nid, bool alive, bool reset,
                 return -EINVAL;
         }
  
-       if (ni != NULL && !alive &&             /* LND telling me she's down */
-           !auto_down) {                       /* auto-down disabled */
+       if (ni && !alive &&             /* LND telling me she's down */
+           !auto_down) {               /* auto-down disabled */
                 CDEBUG(D_NET, "Auto-down disabled\n");
                 return 0;
         }
  
-       /* must lock 0 since this is used for synchronization */
-       lnet_net_lock(0);
+       lnet_net_lock(cpt);
  
         if (the_lnet.ln_state != LNET_STATE_RUNNING) {
-               lnet_net_unlock(0);
+               lnet_net_unlock(cpt);
                 return -ESHUTDOWN;
         }
  
         lpni = lnet_peer_ni_find_locked(nid);
-       if (lpni == NULL) {
+       if (!lpni) {
                 /* nid not found */
-               lnet_net_unlock(0);
+               lnet_net_unlock(cpt);
                 CDEBUG(D_NET, "%s not found\n", libcfs_nidstr(nid));
                 return 0;
         }
  
-       if (alive) {
-               if (reset) {
-                       lpni->lpni_ns_status = LNET_NI_STATUS_UP;
-                       lnet_set_lpni_healthv_locked(lpni,
-                                                    LNET_MAX_HEALTH_VALUE);
-               } else {
-                       lnet_inc_lpni_healthv_locked(lpni);
-               }
-       } else if (reset) {
-               lpni->lpni_ns_status = LNET_NI_STATUS_DOWN;
+       if (lpni->lpni_cpt != cpt) {
+               lnet_net_unlock(cpt);
+               cpt = lpni->lpni_cpt;
+               lnet_net_lock(cpt);
         }
  
-       /* recalculate aliveness */
-       alive = lnet_is_peer_ni_alive(lpni);
+       if (ni && !alive && when < lpni->lpni_last_alive)
+               when = lpni->lpni_last_alive;
+
+       if (lpni->lpni_timestamp > when) {
+               CDEBUG(D_NET, "Out of date\n");
+               goto out_lpni_decref;
+       }
+
+       lpni->lpni_timestamp = when;
  
         lp = lpni->lpni_peer_net->lpn_peer;
-       /* If this is an LNet router then update route aliveness */
+       /* If this peer NI belongs to an LNet router then update the associated
+        * route aliveness. We update before taking lpni_lock below to avoid
+        * holding both lpni_lock and lp_lock which is taken in
+        * lnet_is_discovery_disabled().
+        */
         if (lp->lp_rtr_refcount) {
                 if (reset)
                         /* reset flag indicates gateway peer went up or down */
@@ -1736,12 +1735,6 @@ lnet_notify(struct lnet_ni *ni, struct lnet_nid *nid, bool alive, bool reset,
  
                 /* If discovery is disabled, locally or on the gateway, then
                  * any routes using lpni as next-hop need to be updated
-                *
-                * NB: We can get many notifications while a route is down, so
-                * we try and avoid the expensive net_lock/EX here for the
-                * common case of receiving duplicate lnet_notify() calls (i.e.
-                * only grab EX lock when we actually need to update the route
-                * aliveness).
                  */
                 if (lnet_is_discovery_disabled(lp)) {
                         list_for_each_entry(route, &lp->lp_routes, lr_gwlist) {
@@ -1751,13 +1744,51 @@ lnet_notify(struct lnet_ni *ni, struct lnet_nid *nid, bool alive, bool reset,
                 }
         }
  
-       lnet_net_unlock(0);
+       spin_lock(&lpni->lpni_lock);
  
-       if (ni != NULL && !alive)
-               lnet_notify_peer_down(ni, &lpni->lpni_nid);
+       /* Depending on lnet_peers_start_down()/check_routers_before_use the
+        * lpni_ns_status may initialize to either UP or DOWN. Thus, the
+        * first notification that we receive may match the existing status.
+        * We do not want to ignore this notification.
+        */
+       if (lpni->lpni_notified && lpni->lpni_ns_status == ns_status) {
+               CDEBUG(D_NET, "Old news\n");
+               goto out_lpni_unlock;
+       }
  
-       cpt = lpni->lpni_cpt;
-       lnet_net_lock(cpt);
+       lpni->lpni_notified = true;
+       lpni->lpni_ns_status = ns_status;
+
+       while (lpni->lpni_notifying) {
+               /* Previous event is being processed */
+               spin_unlock(&lpni->lpni_lock);
+               lnet_net_unlock(cpt);
+               schedule();
+               lnet_net_lock(cpt);
+               spin_lock(&lpni->lpni_lock);
+       };
+
+       lpni->lpni_notifying = true;
+
+       if (alive && reset)
+               lnet_set_lpni_healthv_locked(lpni, LNET_MAX_HEALTH_VALUE);
+
+       if (lpni->lpni_ns_status == LNET_NI_STATUS_DOWN &&
+           ni && ni->ni_net->net_lnd->lnd_notify_peer_down) {
+               spin_unlock(&lpni->lpni_lock);
+               lnet_net_unlock(cpt);
+
+               (ni->ni_net->net_lnd->lnd_notify_peer_down)(&lpni->lpni_nid);
+
+               lnet_net_lock(cpt);
+               spin_lock(&lpni->lpni_lock);
+       }
+
+       lpni->lpni_notifying = false;
+
+out_lpni_unlock:
+       spin_unlock(&lpni->lpni_lock);
+out_lpni_decref:
         lnet_peer_ni_decref_locked(lpni);
         lnet_net_unlock(cpt);
  
diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh

index e7a1e40..50dc795 100755 (executable)
--- a/lustre/tests/sanity-lnet.sh
+++ b/lustre/tests/sanity-lnet.sh
@@ -3403,6 +3403,41 @@ test_226() {
  }
  run_test 226 "test missing route for 1 of 2 routers"
  
+test_227() {
+       local opts="lnet_peer_discovery_disabled=1 lnet_health_sensitivity=0"
+       opts+=" lnet_transaction_timeout=10"
+
+       [[ $NETTYPE != kfi* ]] || skip "kfi doesn't support drop rules"
+
+       setup_router_test -p 2 $opts || return $?
+
+       do_basic_rtr_test || return $?
+
+       do_node ${RPEERS[0]} $LNETCTL lnet unconfigure ||
+               error "Failed to unconfigure lnet on ${RPEERS[0]}"
+
+       local rpeer_nids=( ${RPEER_NIDS[${RPEERS[0]}]} )
+
+       do_lnetctl ping ${rpeer_nids[0]} &&
+               error "Expected ping to fail"
+
+       do_lnetctl ping ${rpeer_nids[0]} &&
+               error "Expected ping to fail"
+
+       local dropped=$(do_node ${ROUTERS[0]} \
+                       $LNETCTL peer show -v 2 --nid ${rpeer_nids[0]} |
+                       grep -A 2 dropped_stats |
+                       awk '/get:/{print $2}' |
+                       xargs echo |
+                       sed 's/ /\+/g' | bc)
+
+       ((dropped > 0)) ||
+               error "Expected dropped > 0 found $dropped"
+
+       cleanup_router_test
+}
+run_test 227 "Check router peer health w/DD disabled"
+
  test_230() {
         [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"
author	Chris Horn <chris.horn@hpe.com>
	Fri, 15 Mar 2024 18:52:07 +0000 (12:52 -0600)
committer	Oleg Drokin <green@whamcloud.com>
	Tue, 25 Jun 2024 03:30:58 +0000 (03:30 +0000)
lnet/include/lnet/lib-types.h		patch \| blob \| history
lnet/klnds/kfilnd/kfilnd_peer.c		patch \| blob \| history
lnet/lnet/lib-move.c		patch \| blob \| history
lnet/lnet/lib-msg.c		patch \| blob \| history
lnet/lnet/peer.c		patch \| blob \| history
lnet/lnet/router.c		patch \| blob \| history
lustre/tests/sanity-lnet.sh		patch \| blob \| history