Whamcloud - gitweb
LU-17855 lnet: Set peer NI down on lnet_notify 42/55342/3
authorChris Horn <chris.horn@hpe.com>
Fri, 15 Mar 2024 18:52:07 +0000 (12:52 -0600)
committerOleg Drokin <green@whamcloud.com>
Tue, 25 Jun 2024 03:30:58 +0000 (03:30 +0000)
The LNet router peer health feature is intended to allow LNet routers
drop messages for peer NIs that it considers down/unreachable so that
resources can be freed to forward messages to peer NIs that are
up/reachable.

This feature was integrated with the LNet health feature under
LU-11300, and, as a result, routers only consider a peer NI
down/unreachable if two criteria are met:
1. The router hasn't received a message from the peer NI within the
LND's "peer_timeout" value (default 180 seconds).
2. The health value of the peer NI has been decremented or the cached
peer NI status is LNET_NI_STATUS_DOWN.

(1) is problematic because a lot of messages can be queued to a down
peer while we wait for the peer_timeout to expire. This can
introduce latency for messages being forwarded to other peers.

(2) is problematic because there are some cases where LNet health
will not be decremented (namely single-rail peers), and the cached
peer NI status can only be set to LNET_NI_STATUS_DOWN if the router
receives a discovery push from the peer. If the peer loses all
connectivity to the router then it is possible the router will never
consider it down.

To address the problems with (1) the requirement is dropped
completely.

To address the problems with (2), LNet routers will now decrement
health values of single-rail peers and lnet_notify() is modified to
set the peer NI status UP/DOWN according to the aliveness information
provided by the LND.

HPE-bug-id: LUS-12209
Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I7823cc7ae73bcb0b6b52db8d4f84cff7b999d8c0
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55342
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-types.h
lnet/klnds/kfilnd/kfilnd_peer.c
lnet/lnet/lib-move.c
lnet/lnet/lib-msg.c
lnet/lnet/peer.c
lnet/lnet/router.c
lustre/tests/sanity-lnet.sh

index 72820ee..74bf050 100644 (file)
@@ -1395,6 +1395,14 @@ struct lnet_peer_ni {
        __u32                   lpni_sel_priority;
        /* number of preferred NIDs in lnpi_pref_nids */
        __u32                   lpni_pref_nnids;
+       /* Whether some thread is processing an lnet_notify() event for this
+        * peer NI
+        */
+       bool                    lpni_notifying;
+       /* Timestamp of the last lnet_notify() event for this peer NI */
+       time64_t                lpni_timestamp;
+       /* Whether we've received an lnet_notify() event for this peer NI */
+       bool                    lpni_notified;
 };
 
 /* Preferred path added due to traffic on non-MR peer_ni */
index 8ad2dce..a7eed14 100644 (file)
@@ -377,6 +377,8 @@ void kfilnd_peer_process_hello(struct kfilnd_peer *kp, struct kfilnd_msg *msg)
                        CDEBUG(D_NET, "Peer %s(%p):0x%llx new -> wait response\n",
                               libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr);
        } else if (msg->type == KFILND_MSG_HELLO_RSP) {
+               struct lnet_nid nid;
+
                kp->kp_version = msg->proto.hello.version;
                atomic_set(&kp->kp_state, KP_STATE_UPTODATE);
                CDEBUG(D_NET,
@@ -384,5 +386,9 @@ void kfilnd_peer_process_hello(struct kfilnd_peer *kp, struct kfilnd_msg *msg)
                       libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr,
                       msg->proto.hello.version);
                kfilnd_peer_clear_hello_state(kp);
+
+               lnet_nid4_to_nid(kp->kp_nid, &nid);
+               lnet_notify(kp->kp_dev->kfd_ni, &nid, true, false,
+                           kp->kp_last_alive);
        }
 }
index 64e0c92..40e9adb 100644 (file)
@@ -768,36 +768,14 @@ lnet_check_message_drop(struct lnet_ni *ni, struct lnet_peer_ni *lpni,
        if (!lnet_peer_aliveness_enabled(lpni))
                return 0;
 
-       /* If we're resending a message, let's attempt to send it even if
-        * the peer is down to fulfill our resend quota on the message
-        */
-       if (msg->msg_retry_count > 0)
-               return 0;
-
-       /* try and send recovery messages regardless */
-       if (msg->msg_recovery)
-               return 0;
-
-       /* always send any responses */
-       if (lnet_msg_is_response(msg))
-               return 0;
-
        /* always send non-routed messages */
        if (!msg->msg_routing)
                return 0;
 
-       /* assume peer_ni is alive as long as we're within the configured
-        * peer timeout
-        */
-       if (ktime_get_seconds() <
-           (lpni->lpni_last_alive +
-            lpni->lpni_net->net_tunables.lct_peer_timeout))
+       if (lnet_is_peer_ni_alive(lpni))
                return 0;
 
-       if (!lnet_is_peer_ni_alive(lpni))
-               return -EHOSTUNREACH;
-
-       return 0;
+       return -EHOSTUNREACH;
 }
 
 /**
@@ -4996,7 +4974,11 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr,
         * from it. The ping response reports back the ns_status which is
         * marked on the remote as up or down and we cache it here.
         */
-       msg->msg_rxpeer->lpni_ns_status = LNET_NI_STATUS_UP;
+       if (unlikely(msg->msg_rxpeer->lpni_ns_status != LNET_NI_STATUS_UP)) {
+               spin_lock(&msg->msg_rxpeer->lpni_lock);
+               msg->msg_rxpeer->lpni_ns_status = LNET_NI_STATUS_UP;
+               spin_unlock(&msg->msg_rxpeer->lpni_lock);
+       }
 
        lnet_msg_commit(msg, cpt);
 
index abe4512..76db4a3 100644 (file)
@@ -826,15 +826,16 @@ lnet_health_check(struct lnet_msg *msg)
                lnet_incr_hstats(ni, lpni, hstatus);
                /* For remote failures, health/recovery/resends are not needed
                 * if the peer only has a single interface. Special case for
-                * routers where we rely on health feature to manage route
-                * aliveness. NB: lp_nnis does _not_ include the lolnd, so a
-                * single-rail node would have lp_nnis == 1.
+                * routers where we rely on health feature to manage route and
+                * peer aliveness. NB: unlike pb_nnis above, lp_nnis does _not_
+                * include the lolnd, so a single-rail node would have
+                * lp_nnis == 1.
                 */
                if (lpni && lpni->lpni_peer_net &&
                    lpni->lpni_peer_net->lpn_peer &&
                    lpni->lpni_peer_net->lpn_peer->lp_nnis <= 1) {
                        attempt_remote_resend = false;
-                       if (!lnet_isrouter(lpni))
+                       if (!(lnet_isrouter(lpni) || the_lnet.ln_routing))
                                handle_remote_health = false;
                }
                /* Do not put my interfaces into peer NI recovery. They should
index 9c3d4e1..398fb76 100644 (file)
@@ -160,6 +160,7 @@ lnet_peer_ni_alloc(struct lnet_nid *nid)
        lpni->lpni_nid = *nid;
        lpni->lpni_cpt = cpt;
        atomic_set(&lpni->lpni_healthv, LNET_MAX_HEALTH_VALUE);
+       lpni->lpni_notified = false;
 
        net = lnet_get_net_locked(LNET_NID_NET(nid));
        lpni->lpni_net = net;
@@ -3189,15 +3190,25 @@ int ping_info_count_entries(struct lnet_ping_buffer *pbuf)
        return nnis;
 }
 
-static inline void handle_disc_lpni_health(struct lnet_peer_ni *lpni)
+static inline void handle_disc_lpni_health(struct lnet_peer_ni *lpni,
+                                          __u32 new_status)
 {
-       if (lpni->lpni_ns_status == LNET_NI_STATUS_DOWN) {
+       __u32 old_status;
+
+       spin_lock(&lpni->lpni_lock);
+       old_status = lpni->lpni_ns_status;
+       lpni->lpni_ns_status = new_status;
+       spin_unlock(&lpni->lpni_lock);
+
+       /* Decrement health when transitioning from UP to DOWN */
+       if (old_status != new_status && new_status == LNET_NI_STATUS_DOWN) {
                lnet_net_lock(0);
                lnet_handle_remote_failure_locked(lpni);
                lnet_net_unlock(0);
-       } else if (lpni->lpni_ns_status == LNET_NI_STATUS_UP &&
-                !lpni->lpni_last_alive)
+       } else if (new_status == LNET_NI_STATUS_UP && !lpni->lpni_last_alive) {
+               /* Set health to max if the initial status is UP */
                atomic_set(&lpni->lpni_healthv, LNET_MAX_HEALTH_VALUE);
+       }
 }
 
 /*
@@ -3239,7 +3250,6 @@ static int lnet_peer_merge_data(struct lnet_peer *lp,
        int i;
        int j;
        int rc;
-       __u32 old_st;
 
        flags = LNET_PEER_DISCOVERED;
        if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL)
@@ -3318,10 +3328,7 @@ static int lnet_peer_merge_data(struct lnet_peer *lp,
                                 */
                                lpni = lnet_peer_ni_find_locked(&curnis[i]);
                                if (lpni) {
-                                       old_st = lpni->lpni_ns_status;
-                                       lpni->lpni_ns_status = *stp;
-                                       if (old_st != lpni->lpni_ns_status)
-                                               handle_disc_lpni_health(lpni);
+                                       handle_disc_lpni_health(lpni, *stp);
                                        lnet_peer_ni_decref_locked(lpni);
                                }
                                break;
@@ -3351,8 +3358,7 @@ static int lnet_peer_merge_data(struct lnet_peer *lp,
                }
                lpni = lnet_peer_ni_find_locked(&addnis[i].ns_nid);
                if (lpni) {
-                       lpni->lpni_ns_status = addnis[i].ns_status;
-                       handle_disc_lpni_health(lpni);
+                       handle_disc_lpni_health(lpni, addnis[i].ns_status);
                        lnet_peer_ni_decref_locked(lpni);
                }
        }
index 6bc84b6..9b43823 100644 (file)
@@ -1644,13 +1644,6 @@ lnet_rtrpools_disable(void)
        lnet_rtrpools_free(1);
 }
 
-static inline void
-lnet_notify_peer_down(struct lnet_ni *ni, struct lnet_nid *nid)
-{
-       if (ni->ni_net->net_lnd->lnd_notify_peer_down != NULL)
-               (ni->ni_net->net_lnd->lnd_notify_peer_down)(nid);
-}
-
 /*
  * ni: local NI used to communicate with the peer
  * nid: peer NID
@@ -1666,7 +1659,9 @@ lnet_notify(struct lnet_ni *ni, struct lnet_nid *nid, bool alive, bool reset,
        struct lnet_route *route;
        struct lnet_peer *lp;
        time64_t now = ktime_get_seconds();
-       int cpt;
+       int cpt = lnet_nid2cpt(nid, ni);
+       unsigned int ns_status = alive ? LNET_NI_STATUS_UP :
+                                        LNET_NI_STATUS_DOWN;
 
        LASSERT(!in_interrupt());
 
@@ -1690,45 +1685,49 @@ lnet_notify(struct lnet_ni *ni, struct lnet_nid *nid, bool alive, bool reset,
                return -EINVAL;
        }
 
-       if (ni != NULL && !alive &&             /* LND telling me she's down */
-           !auto_down) {                       /* auto-down disabled */
+       if (ni && !alive &&             /* LND telling me she's down */
+           !auto_down) {               /* auto-down disabled */
                CDEBUG(D_NET, "Auto-down disabled\n");
                return 0;
        }
 
-       /* must lock 0 since this is used for synchronization */
-       lnet_net_lock(0);
+       lnet_net_lock(cpt);
 
        if (the_lnet.ln_state != LNET_STATE_RUNNING) {
-               lnet_net_unlock(0);
+               lnet_net_unlock(cpt);
                return -ESHUTDOWN;
        }
 
        lpni = lnet_peer_ni_find_locked(nid);
-       if (lpni == NULL) {
+       if (!lpni) {
                /* nid not found */
-               lnet_net_unlock(0);
+               lnet_net_unlock(cpt);
                CDEBUG(D_NET, "%s not found\n", libcfs_nidstr(nid));
                return 0;
        }
 
-       if (alive) {
-               if (reset) {
-                       lpni->lpni_ns_status = LNET_NI_STATUS_UP;
-                       lnet_set_lpni_healthv_locked(lpni,
-                                                    LNET_MAX_HEALTH_VALUE);
-               } else {
-                       lnet_inc_lpni_healthv_locked(lpni);
-               }
-       } else if (reset) {
-               lpni->lpni_ns_status = LNET_NI_STATUS_DOWN;
+       if (lpni->lpni_cpt != cpt) {
+               lnet_net_unlock(cpt);
+               cpt = lpni->lpni_cpt;
+               lnet_net_lock(cpt);
        }
 
-       /* recalculate aliveness */
-       alive = lnet_is_peer_ni_alive(lpni);
+       if (ni && !alive && when < lpni->lpni_last_alive)
+               when = lpni->lpni_last_alive;
+
+       if (lpni->lpni_timestamp > when) {
+               CDEBUG(D_NET, "Out of date\n");
+               goto out_lpni_decref;
+       }
+
+       lpni->lpni_timestamp = when;
 
        lp = lpni->lpni_peer_net->lpn_peer;
-       /* If this is an LNet router then update route aliveness */
+       /* If this peer NI belongs to an LNet router then update the associated
+        * route aliveness. We update before taking lpni_lock below to avoid
+        * holding both lpni_lock and lp_lock which is taken in
+        * lnet_is_discovery_disabled().
+        */
        if (lp->lp_rtr_refcount) {
                if (reset)
                        /* reset flag indicates gateway peer went up or down */
@@ -1736,12 +1735,6 @@ lnet_notify(struct lnet_ni *ni, struct lnet_nid *nid, bool alive, bool reset,
 
                /* If discovery is disabled, locally or on the gateway, then
                 * any routes using lpni as next-hop need to be updated
-                *
-                * NB: We can get many notifications while a route is down, so
-                * we try and avoid the expensive net_lock/EX here for the
-                * common case of receiving duplicate lnet_notify() calls (i.e.
-                * only grab EX lock when we actually need to update the route
-                * aliveness).
                 */
                if (lnet_is_discovery_disabled(lp)) {
                        list_for_each_entry(route, &lp->lp_routes, lr_gwlist) {
@@ -1751,13 +1744,51 @@ lnet_notify(struct lnet_ni *ni, struct lnet_nid *nid, bool alive, bool reset,
                }
        }
 
-       lnet_net_unlock(0);
+       spin_lock(&lpni->lpni_lock);
 
-       if (ni != NULL && !alive)
-               lnet_notify_peer_down(ni, &lpni->lpni_nid);
+       /* Depending on lnet_peers_start_down()/check_routers_before_use the
+        * lpni_ns_status may initialize to either UP or DOWN. Thus, the
+        * first notification that we receive may match the existing status.
+        * We do not want to ignore this notification.
+        */
+       if (lpni->lpni_notified && lpni->lpni_ns_status == ns_status) {
+               CDEBUG(D_NET, "Old news\n");
+               goto out_lpni_unlock;
+       }
 
-       cpt = lpni->lpni_cpt;
-       lnet_net_lock(cpt);
+       lpni->lpni_notified = true;
+       lpni->lpni_ns_status = ns_status;
+
+       while (lpni->lpni_notifying) {
+               /* Previous event is being processed */
+               spin_unlock(&lpni->lpni_lock);
+               lnet_net_unlock(cpt);
+               schedule();
+               lnet_net_lock(cpt);
+               spin_lock(&lpni->lpni_lock);
+       };
+
+       lpni->lpni_notifying = true;
+
+       if (alive && reset)
+               lnet_set_lpni_healthv_locked(lpni, LNET_MAX_HEALTH_VALUE);
+
+       if (lpni->lpni_ns_status == LNET_NI_STATUS_DOWN &&
+           ni && ni->ni_net->net_lnd->lnd_notify_peer_down) {
+               spin_unlock(&lpni->lpni_lock);
+               lnet_net_unlock(cpt);
+
+               (ni->ni_net->net_lnd->lnd_notify_peer_down)(&lpni->lpni_nid);
+
+               lnet_net_lock(cpt);
+               spin_lock(&lpni->lpni_lock);
+       }
+
+       lpni->lpni_notifying = false;
+
+out_lpni_unlock:
+       spin_unlock(&lpni->lpni_lock);
+out_lpni_decref:
        lnet_peer_ni_decref_locked(lpni);
        lnet_net_unlock(cpt);
 
index e7a1e40..50dc795 100755 (executable)
@@ -3403,6 +3403,41 @@ test_226() {
 }
 run_test 226 "test missing route for 1 of 2 routers"
 
+test_227() {
+       local opts="lnet_peer_discovery_disabled=1 lnet_health_sensitivity=0"
+       opts+=" lnet_transaction_timeout=10"
+
+       [[ $NETTYPE != kfi* ]] || skip "kfi doesn't support drop rules"
+
+       setup_router_test -p 2 $opts || return $?
+
+       do_basic_rtr_test || return $?
+
+       do_node ${RPEERS[0]} $LNETCTL lnet unconfigure ||
+               error "Failed to unconfigure lnet on ${RPEERS[0]}"
+
+       local rpeer_nids=( ${RPEER_NIDS[${RPEERS[0]}]} )
+
+       do_lnetctl ping ${rpeer_nids[0]} &&
+               error "Expected ping to fail"
+
+       do_lnetctl ping ${rpeer_nids[0]} &&
+               error "Expected ping to fail"
+
+       local dropped=$(do_node ${ROUTERS[0]} \
+                       $LNETCTL peer show -v 2 --nid ${rpeer_nids[0]} |
+                       grep -A 2 dropped_stats |
+                       awk '/get:/{print $2}' |
+                       xargs echo |
+                       sed 's/ /\+/g' | bc)
+
+       ((dropped > 0)) ||
+               error "Expected dropped > 0 found $dropped"
+
+       cleanup_router_test
+}
+run_test 227 "Check router peer health w/DD disabled"
+
 test_230() {
        [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"