Whamcloud - gitweb
LU-11297 lnet: handle router health off 34/33634/31
author: Amir Shehata <ashehata@whamcloud.com>
Fri, 9 Nov 2018 18:31:27 +0000 (10:31 -0800)
committer: Amir Shehata <ashehata@whamcloud.com>
Fri, 7 Jun 2019 18:20:09 +0000 (18:20 +0000)
Routing infrastructure depends on health infrastructure to manage
route status. However, health can be turned off. Therefore, we need
to enable health for gateways in order to monitor them properly.
Each peer now has its own health sensitivity. When adding a route,
the gateway's health sensitivity can be set explicitly from lnetctl;
if it is not specified, it defaults to 1, which turns health on for
that gateway and allows peer NI recovery after a failure.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Ibae33d595e97d0eec432ae8f5d51898ce0776f01
Reviewed-on: https://review.whamcloud.com/33634
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Tested-by: Jenkins
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/include/uapi/linux/lnet/lnet-dlc.h
lnet/lnet/api-ni.c
lnet/lnet/config.c
lnet/lnet/lib-msg.c
lnet/lnet/peer.c
lnet/lnet/router.c

index 1910c1e..a1ad5a7 100644 (file)
@@ -577,11 +577,12 @@ int lnet_notify(struct lnet_ni *ni, lnet_nid_t peer, bool alive, bool reset,
 void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive,
                        time64_t when);
 int lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway_nid,
-                  unsigned int priority);
+                  __u32 priority, __u32 sensitivity);
 int lnet_del_route(__u32 net, lnet_nid_t gw_nid);
 void lnet_destroy_routes(void);
 int lnet_get_route(int idx, __u32 *net, __u32 *hops,
-                  lnet_nid_t *gateway, __u32 *alive, __u32 *priority);
+                  lnet_nid_t *gateway, __u32 *alive, __u32 *priority,
+                  __u32 *sensitivity);
 int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg);
 struct lnet_ni *lnet_get_next_ni_locked(struct lnet_net *mynet,
                                        struct lnet_ni *prev);
index c83d537..6121592 100644 (file)
@@ -624,6 +624,12 @@ struct lnet_peer {
        /* # refs from lnet_route_t::lr_gateway */
        int                     lp_rtr_refcount;
 
+       /*
+        * peer specific health sensitivity value to decrement peer nis in
+        * this peer with if set to something other than 0
+        */
+       __u32                   lp_health_sensitivity;
+
        /* messages blocking for router credits */
        struct list_head        lp_rtrq;
 
index f10cbc3..a454a65 100644 (file)
@@ -139,6 +139,7 @@ struct lnet_ioctl_config_data {
                        __u32 rtr_hop;
                        __u32 rtr_priority;
                        __u32 rtr_flags;
+                       __u32 rtr_sensitivity;
                } cfg_route;
                struct {
                        char net_intf[LNET_MAX_STR_LEN];
index 91a128a..56a6f9c 100644 (file)
@@ -3533,20 +3533,28 @@ LNetCtl(unsigned int cmd, void *arg)
        case IOC_LIBCFS_FAIL_NID:
                return lnet_fail_nid(data->ioc_nid, data->ioc_count);
 
-       case IOC_LIBCFS_ADD_ROUTE:
+       case IOC_LIBCFS_ADD_ROUTE: {
+               /* default router sensitivity to 1 */
+               unsigned int sensitivity = 1;
                config = arg;
 
                if (config->cfg_hdr.ioc_len < sizeof(*config))
                        return -EINVAL;
 
+               if (config->cfg_config_u.cfg_route.rtr_sensitivity) {
+                       sensitivity =
+                         config->cfg_config_u.cfg_route.rtr_sensitivity;
+               }
+
                mutex_lock(&the_lnet.ln_api_mutex);
                rc = lnet_add_route(config->cfg_net,
                                    config->cfg_config_u.cfg_route.rtr_hop,
                                    config->cfg_nid,
                                    config->cfg_config_u.cfg_route.
-                                       rtr_priority);
+                                       rtr_priority, sensitivity);
                mutex_unlock(&the_lnet.ln_api_mutex);
                return rc;
+       }
 
        case IOC_LIBCFS_DEL_ROUTE:
                config = arg;
@@ -3572,7 +3580,9 @@ LNetCtl(unsigned int cmd, void *arg)
                                    &config->cfg_nid,
                                    &config->cfg_config_u.cfg_route.rtr_flags,
                                    &config->cfg_config_u.cfg_route.
-                                       rtr_priority);
+                                       rtr_priority,
+                                   &config->cfg_config_u.cfg_route.
+                                       rtr_sensitivity);
                mutex_unlock(&the_lnet.ln_api_mutex);
                return rc;
 
index a8ff2e3..75e6e39 100644 (file)
@@ -1246,7 +1246,7 @@ lnet_parse_route (char *str, int *im_a_router)
                                continue;
                        }
 
-                       rc = lnet_add_route(net, hops, nid, priority);
+                       rc = lnet_add_route(net, hops, nid, priority, 1);
                        if (rc != 0 && rc != -EEXIST && rc != -EHOSTUNREACH) {
                                CERROR("Can't create route "
                                       "to %s via %s\n",
index 6541811..445d5a2 100644 (file)
@@ -440,14 +440,14 @@ lnet_complete_msg_locked(struct lnet_msg *msg, int cpt)
 }
 
 static void
-lnet_dec_healthv_locked(atomic_t *healthv)
+lnet_dec_healthv_locked(atomic_t *healthv, int sensitivity)
 {
        int h = atomic_read(healthv);
 
-       if (h < lnet_health_sensitivity) {
+       if (h < sensitivity) {
                atomic_set(healthv, 0);
        } else {
-               h -= lnet_health_sensitivity;
+               h -= sensitivity;
                atomic_set(healthv, h);
        }
 }
@@ -466,7 +466,7 @@ lnet_handle_local_failure(struct lnet_ni *local_ni)
                return;
        }
 
-       lnet_dec_healthv_locked(&local_ni->ni_healthv);
+       lnet_dec_healthv_locked(&local_ni->ni_healthv, lnet_health_sensitivity);
        /*
         * add the NI to the recovery queue if it's not already there
         * and it's health value is actually below the maximum. It's
@@ -489,11 +489,22 @@ lnet_handle_local_failure(struct lnet_ni *local_ni)
 void
 lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni)
 {
+       __u32 sensitivity = lnet_health_sensitivity;
+       __u32 lp_sensitivity;
+
        /* lpni could be NULL if we're in the LOLND case */
        if (!lpni)
                return;
 
-       lnet_dec_healthv_locked(&lpni->lpni_healthv);
+       /*
+        * If there is a health sensitivity in the peer then use that
+        * instead of the globally set one.
+        */
+       lp_sensitivity = lpni->lpni_peer_net->lpn_peer->lp_health_sensitivity;
+       if (lp_sensitivity)
+               sensitivity = lp_sensitivity;
+
+       lnet_dec_healthv_locked(&lpni->lpni_healthv, sensitivity);
        /*
         * add the peer NI to the recovery queue if it's not already there
         * and it's health value is actually below the maximum. It's
index f3e675e..c7dee63 100644 (file)
@@ -258,6 +258,14 @@ lnet_peer_alloc(lnet_nid_t nid)
        init_waitqueue_head(&lp->lp_dc_waitq);
        spin_lock_init(&lp->lp_lock);
        lp->lp_primary_nid = nid;
+
+       /*
+        * all peers created on a router should have health on
+        * if it's not already on.
+        */
+       if (the_lnet.ln_routing && !lnet_health_sensitivity)
+               lp->lp_health_sensitivity = 1;
+
        /*
         * Turn off discovery for loopback peer. If you're creating a peer
         * for the loopback interface then that was initiated when we
index 0346cf0..fb7f905 100644 (file)
@@ -425,7 +425,7 @@ lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route)
 
 int
 lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
-              unsigned int priority)
+              __u32 priority, __u32 sensitivity)
 {
        struct list_head *route_entry;
        struct lnet_remotenet *rnet;
@@ -528,8 +528,10 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
         * to move the routes from the peer that's being deleted to the
         * consolidated peer lp_routes list
         */
-       if (add_route)
+       if (add_route) {
+               gw->lp_health_sensitivity = sensitivity;
                lnet_add_route_to_rnet(rnet2, route);
+       }
 
        /*
         * get rid of the reference on the lpni.
@@ -698,15 +700,15 @@ int lnet_get_rtr_pool_cfg(int cpt, struct lnet_ioctl_pool_cfg *pool_cfg)
 
 int
 lnet_get_route(int idx, __u32 *net, __u32 *hops,
-              lnet_nid_t *gateway, __u32 *alive, __u32 *priority)
+              lnet_nid_t *gateway, __u32 *alive, __u32 *priority, __u32 *sensitivity)
 {
-       struct list_head *e1;
-       struct list_head *e2;
        struct lnet_remotenet *rnet;
-       struct lnet_route        *route;
-       int               cpt;
-       int               i;
        struct list_head *rn_list;
+       struct lnet_route *route;
+       struct list_head *e1;
+       struct list_head *e2;
+       int cpt;
+       int i;
 
        cpt = lnet_net_lock_current();
 
@@ -724,6 +726,8 @@ lnet_get_route(int idx, __u32 *net, __u32 *hops,
                                        *gateway  = route->lr_nid;
                                        *hops     = route->lr_hops;
                                        *priority = route->lr_priority;
+                                       *sensitivity = route->lr_gateway->
+                                               lp_health_sensitivity;
                                        *alive    = lnet_is_route_alive(route);
                                        lnet_net_unlock(cpt);
                                        return 0;