Whamcloud - gitweb
LU-11299 lnet: Cleanup rcd 87/33187/35
author Amir Shehata <ashehata@whamcloud.com>
Mon, 22 Oct 2018 22:09:11 +0000 (15:09 -0700)
committer Amir Shehata <ashehata@whamcloud.com>
Fri, 7 Jun 2019 18:15:38 +0000 (18:15 +0000)
Clean up all code pertaining to rcd (router checker data), as the routing
code will use discovery going forward and will no longer need to keep its
own pinging code.

test_215 looks at the routers file which had its format changed.
Update the test to reflect the change.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: If31caa3b5703df40b6ae0f758f2fe764991aa4f3
Reviewed-on: https://review.whamcloud.com/33187
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Tested-by: Jenkins
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/lnet/api-ni.c
lnet/lnet/lib-move.c
lnet/lnet/router.c
lnet/lnet/router_proc.c
lustre/tests/sanity.sh

index 48a959c..6f6b146 100644 (file)
@@ -857,11 +857,7 @@ void lnet_monitor_thr_stop(void);
 
 bool lnet_router_checker_active(void);
 void lnet_check_routers(void);
-int lnet_router_pre_mt_start(void);
 void lnet_router_post_mt_start(void);
-void lnet_prune_rc_data(int wait_unlink);
-void lnet_router_cleanup(void);
-void lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net);
 void lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf);
 
 int lnet_ping_info_validate(struct lnet_ping_info *pinfo);
index 0ab2d27..92aa993 100644 (file)
@@ -527,16 +527,6 @@ struct lnet_ping_buffer {
 #define LNET_PING_INFO_TO_BUFFER(PINFO)        \
        container_of((PINFO), struct lnet_ping_buffer, pb_info)
 
-/* router checker data, per router */
-struct lnet_rc_data {
-       /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */
-       struct list_head        rcd_list;
-       struct lnet_handle_md   rcd_mdh;        /* ping buffer MD */
-       struct lnet_peer_ni     *rcd_gateway;   /* reference to gateway */
-       struct lnet_ping_buffer *rcd_pingbuffer;/* ping buffer */
-       int                     rcd_nnis;       /* desired size of buffer */
-};
-
 struct lnet_peer_ni {
        /* chain on lpn_peer_nis */
        struct list_head        lpni_peer_nis;
@@ -569,20 +559,6 @@ struct lnet_peer_ni {
        int                     lpni_minrtrcredits;
        /* bytes queued for sending */
        long                    lpni_txqnob;
-       /* notification outstanding? */
-       bool                    lpni_notify;
-       /* outstanding notification for LND? */
-       bool                    lpni_notifylnd;
-       /* some thread is handling notification */
-       bool                    lpni_notifying;
-       /* # times router went dead<->alive. Protected with lpni_lock */
-       int                     lpni_alive_count;
-       /* time of last aliveness news */
-       time64_t                lpni_timestamp;
-       /* when I was last alive */
-       time64_t                lpni_last_alive;
-       /* when lpni_ni was queried last time */
-       time64_t                lpni_last_query;
        /* network peer is on */
        struct lnet_net         *lpni_net;
        /* peer's NID */
@@ -612,8 +588,6 @@ struct lnet_peer_ni {
        } lpni_pref;
        /* number of preferred NIDs in lnpi_pref_nids */
        __u32                   lpni_pref_nnids;
-       /* router checker state */
-       struct lnet_rc_data     *lpni_rcd;
 };
 
 /* Preferred path added due to traffic on non-MR peer_ni */
@@ -821,7 +795,6 @@ struct lnet_route {
        __u32                   lr_net;         /* remote network number */
        __u32                   lr_lnet;        /* local network number */
        int                     lr_seq;         /* sequence for round-robin */
-       unsigned int            lr_downis;      /* number of down NIs */
        __u32                   lr_hops;        /* how far I am */
        unsigned int            lr_priority;    /* route priority */
 };
@@ -1094,12 +1067,6 @@ struct lnet {
 
        /* monitor thread startup/shutdown state */
        int                             ln_mt_state;
-       /* router checker's event queue */
-       struct lnet_handle_eq           ln_rc_eqh;
-       /* rcd still pending on net */
-       struct list_head                ln_rcd_deathrow;
-       /* rcd ready for free */
-       struct list_head                ln_rcd_zombie;
        /* serialise startup/shutdown */
        struct semaphore                ln_mt_signal;
 
index 99ccfd7..1c6a931 100644 (file)
@@ -1536,6 +1536,28 @@ lnet_get_ni_count(void)
        return count;
 }
 
+void
+lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf)
+{
+       struct lnet_ni_status *stat;
+       int nnis;
+       int i;
+
+       __swab32s(&pbuf->pb_info.pi_magic);
+       __swab32s(&pbuf->pb_info.pi_features);
+       __swab32s(&pbuf->pb_info.pi_pid);
+       __swab32s(&pbuf->pb_info.pi_nnis);
+       nnis = pbuf->pb_info.pi_nnis;
+       if (nnis > pbuf->pb_nnis)
+               nnis = pbuf->pb_nnis;
+       for (i = 0; i < nnis; i++) {
+               stat = &pbuf->pb_info.pi_ni[i];
+               __swab64s(&stat->ns_nid);
+               __swab32s(&stat->ns_status);
+       }
+       return;
+}
+
 int
 lnet_ping_info_validate(struct lnet_ping_info *pinfo)
 {
@@ -2454,12 +2476,9 @@ int lnet_lib_init(void)
        }
 
        the_lnet.ln_refcount = 0;
-       LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh);
        INIT_LIST_HEAD(&the_lnet.ln_lnds);
        INIT_LIST_HEAD(&the_lnet.ln_net_zombie);
-       INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
        INIT_LIST_HEAD(&the_lnet.ln_msg_resend);
-       INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
 
        /* The hash table size is the number of bits it takes to express the set
         * ln_num_routes, minus 1 (better to under estimate than over so we
index 0ef94aa..f3f47ff 100644 (file)
@@ -3407,9 +3407,6 @@ lnet_monitor_thread(void *arg)
                                                cfs_time_seconds(interval));
        }
 
-       /* clean up the router checker */
-       lnet_prune_rc_data(1);
-
        /* Shutting down */
        lnet_net_lock(LNET_LOCK_EX);
        the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN;
@@ -3622,11 +3619,6 @@ int lnet_monitor_thr_start(void)
        if (rc)
                goto clean_queues;
 
-       /* Pre monitor thread start processing */
-       rc = lnet_router_pre_mt_start();
-       if (rc)
-               goto free_mem;
-
        sema_init(&the_lnet.ln_mt_signal, 0);
 
        lnet_net_lock(LNET_LOCK_EX);
@@ -3651,8 +3643,6 @@ clean_thread:
        /* block until event callback signals exit */
        down(&the_lnet.ln_mt_signal);
        /* clean up */
-       lnet_router_cleanup();
-free_mem:
        lnet_net_lock(LNET_LOCK_EX);
        the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN;
        lnet_net_unlock(LNET_LOCK_EX);
@@ -3688,7 +3678,6 @@ void lnet_monitor_thr_stop(void)
        LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN);
 
        /* perform cleanup tasks */
-       lnet_router_cleanup();
        lnet_rsp_tracker_clean();
        lnet_clean_local_ni_recoveryq();
        lnet_clean_peer_ni_recoveryq();
index 37bc811..1ceae7b 100644 (file)
@@ -230,99 +230,6 @@ bool lnet_is_route_alive(struct lnet_route *route)
        return route_alive;
 }
 
-void
-lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive,
-                  time64_t when)
-{
-       if (lp->lpni_timestamp > when) { /* out of date information */
-               CDEBUG(D_NET, "Out of date\n");
-               return;
-       }
-
-       /*
-        * This function can be called with different cpt locks being
-        * held. lpni_alive_count modification needs to be properly protected.
-        * Significant reads to lpni_alive_count are also protected with
-        * the same lock
-        */
-       spin_lock(&lp->lpni_lock);
-
-       lp->lpni_timestamp = when; /* update timestamp */
-
-       /* got old news */
-       if (lp->lpni_alive_count != 0 &&
-           /* new date for old news */
-           (!lnet_is_peer_ni_alive(lp)) == (!alive)) {
-               spin_unlock(&lp->lpni_lock);
-               CDEBUG(D_NET, "Old news\n");
-               return;
-       }
-
-       /* Flag that notification is outstanding */
-
-       lp->lpni_alive_count++;
-       lp->lpni_notify = 1;
-       lp->lpni_notifylnd = notifylnd;
-       if (lnet_is_peer_ni_alive(lp))
-               lp->lpni_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
-
-       spin_unlock(&lp->lpni_lock);
-
-       CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lpni_nid), alive);
-}
-
-/*
- * This function will always be called with lp->lpni_cpt lock held.
- */
-static void
-lnet_ni_notify_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp)
-{
-       int alive;
-       int notifylnd;
-
-       /* Notify only in 1 thread at any time to ensure ordered notification.
-        * NB individual events can be missed; the only guarantee is that you
-        * always get the most recent news */
-
-       spin_lock(&lp->lpni_lock);
-
-       if (lp->lpni_notifying || ni == NULL) {
-               spin_unlock(&lp->lpni_lock);
-               return;
-       }
-
-       lp->lpni_notifying = 1;
-
-       /*
-        * lp->lpni_notify needs to be protected because it can be set in
-        * lnet_notify_locked().
-        */
-       while (lp->lpni_notify) {
-               alive     = lnet_is_peer_ni_alive(lp);
-               notifylnd = lp->lpni_notifylnd;
-
-               lp->lpni_notifylnd = 0;
-               lp->lpni_notify    = 0;
-
-               if (notifylnd && ni->ni_net->net_lnd->lnd_notify != NULL) {
-                       spin_unlock(&lp->lpni_lock);
-                       lnet_net_unlock(lp->lpni_cpt);
-
-                       /* A new notification could happen now; I'll handle it
-                        * when control returns to me */
-
-                       (ni->ni_net->net_lnd->lnd_notify)(ni, lp->lpni_nid,
-                                                         alive);
-
-                       lnet_net_lock(lp->lpni_cpt);
-                       spin_lock(&lp->lpni_lock);
-               }
-       }
-
-       lp->lpni_notifying = 0;
-       spin_unlock(&lp->lpni_lock);
-}
-
 static void
 lnet_rtr_addref_locked(struct lnet_peer *lp)
 {
@@ -744,89 +651,6 @@ lnet_get_route(int idx, __u32 *net, __u32 *hops,
        return -ENOENT;
 }
 
-void
-lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf)
-{
-       struct lnet_ni_status *stat;
-       int nnis;
-       int i;
-
-       __swab32s(&pbuf->pb_info.pi_magic);
-       __swab32s(&pbuf->pb_info.pi_features);
-       __swab32s(&pbuf->pb_info.pi_pid);
-       __swab32s(&pbuf->pb_info.pi_nnis);
-       nnis = pbuf->pb_info.pi_nnis;
-       if (nnis > pbuf->pb_nnis)
-               nnis = pbuf->pb_nnis;
-       for (i = 0; i < nnis; i++) {
-               stat = &pbuf->pb_info.pi_ni[i];
-               __swab64s(&stat->ns_nid);
-               __swab32s(&stat->ns_status);
-       }
-       return;
-}
-
-/**
- * TODO: re-implement
- */
-static void
-lnet_parse_rc_info(struct lnet_rc_data *rcd)
-{
-       rcd = rcd;
-}
-
-static void
-lnet_router_checker_event(struct lnet_event *event)
-{
-       struct lnet_rc_data *rcd = event->md.user_ptr;
-       struct lnet_peer_ni *lp;
-
-       LASSERT(rcd != NULL);
-
-       if (event->unlinked) {
-               LNetInvalidateMDHandle(&rcd->rcd_mdh);
-               return;
-       }
-
-       LASSERT(event->type == LNET_EVENT_SEND ||
-               event->type == LNET_EVENT_REPLY);
-
-       lp = rcd->rcd_gateway;
-       LASSERT(lp != NULL);
-
-        /* NB: it's called with holding lnet_res_lock, we have a few
-         * places need to hold both locks at the same time, please take
-         * care of lock ordering */
-       lnet_net_lock(lp->lpni_cpt);
-       if (!lnet_isrouter(lp) || lp->lpni_rcd != rcd) {
-               /* ignore if no longer a router or rcd is replaced */
-               goto out;
-       }
-
-       if (event->type == LNET_EVENT_SEND) {
-               if (event->status == 0)
-                       goto out;
-       }
-
-       /* LNET_EVENT_REPLY */
-       /* A successful REPLY means the router is up.  If _any_ comms
-        * to the router fail I assume it's down (this will happen if
-        * we ping alive routers to try to detect router death before
-        * apps get burned). */
-
-       lnet_notify_locked(lp, 1, !event->status, ktime_get_seconds());
-       /* The router checker will wake up very shortly and do the
-        * actual notification.
-        * XXX If 'lp' stops being a router before then, it will still
-        * have the notification pending!!! */
-
-       if (avoid_asym_router_failure && event->status == 0)
-               lnet_parse_rc_info(rcd);
-
- out:
-       lnet_net_unlock(lp->lpni_cpt);
-}
-
 static void
 lnet_wait_known_routerstate(void)
 {
@@ -864,26 +688,6 @@ lnet_wait_known_routerstate(void)
        }
 }
 
-/* TODO: reimplement */
-void
-lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net)
-{
-       struct lnet_route *rte;
-       struct lnet_peer *lp;
-
-       if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0)
-               lp = gw->lpni_peer_net->lpn_peer;
-       else
-               return;
-
-       list_for_each_entry(rte, &lp->lp_routes, lr_gwlist) {
-               if (rte->lr_net == net) {
-                       rte->lr_downis = 0;
-                       break;
-               }
-       }
-}
-
 static void
 lnet_update_ni_status_locked(void)
 {
@@ -924,27 +728,6 @@ lnet_update_ni_status_locked(void)
        }
 }
 
-int lnet_router_pre_mt_start(void)
-{
-       int rc;
-
-       if (check_routers_before_use &&
-           dead_router_check_interval <= 0) {
-               LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be"
-                                  " set if 'check_routers_before_use' is set"
-                                  "\n");
-               return -EINVAL;
-       }
-
-       rc = LNetEQAlloc(0, lnet_router_checker_event, &the_lnet.ln_rc_eqh);
-       if (rc != 0) {
-               CERROR("Can't allocate EQ(0): %d\n", rc);
-               return -ENOMEM;
-       }
-
-       return 0;
-}
-
 void lnet_router_post_mt_start(void)
 {
        if (check_routers_before_use) {
@@ -955,22 +738,6 @@ void lnet_router_post_mt_start(void)
        }
 }
 
-void
-lnet_router_cleanup(void)
-{
-       int rc;
-
-       rc = LNetEQFree(the_lnet.ln_rc_eqh);
-       LASSERT(rc == 0);
-       return;
-}
-
-void
-lnet_prune_rc_data(int wait_unlink)
-{
-       wait_unlink = wait_unlink;
-}
-
 /*
  * This function is called from the monitor thread to check if there are
  * any active routers that need to be checked.
@@ -986,11 +753,6 @@ lnet_router_checker_active(void)
        if (the_lnet.ln_routing)
                return true;
 
-       /* if there are routers that need to be cleaned up then do so */
-       if (!list_empty(&the_lnet.ln_rcd_deathrow) ||
-           !list_empty(&the_lnet.ln_rcd_zombie))
-               return true;
-
        return !list_empty(&the_lnet.ln_routers) &&
                (live_router_check_interval > 0 ||
                 dead_router_check_interval > 0);
@@ -1025,8 +787,6 @@ rescan:
                lnet_update_ni_status_locked();
 
        lnet_net_unlock(cpt);
-
-       lnet_prune_rc_data(0); /* don't wait for UNLINK */
 }
 
 void
@@ -1519,18 +1279,6 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, time64_t when)
                lnet_net_lock(cpt);
        }
 
-       /* We can't fully trust LND on reporting exact peer last_alive
-        * if he notifies us about dead peer. For example ksocklnd can
-        * call us with when == _time_when_the_node_was_booted_ if
-        * no connections were successfully established */
-       if (ni != NULL && !alive && when < lp->lpni_last_alive)
-               when = lp->lpni_last_alive;
-
-       lnet_notify_locked(lp, ni == NULL, alive, when);
-
-       if (ni != NULL)
-               lnet_ni_notify_locked(ni, lp);
-
        lnet_peer_ni_decref_locked(lp);
 
        lnet_net_unlock(cpt);
index aaa12cc..5627e40 100644 (file)
@@ -224,18 +224,18 @@ proc_lnet_routes(struct ctl_table *table, int write, void __user *buffer,
                }
 
                if (route != NULL) {
-                       __u32        net        = rnet->lrn_net;
-                       __u32 hops              = route->lr_hops;
-                       unsigned int priority   = route->lr_priority;
-                       lnet_nid_t   nid        = route->lr_gateway->lp_primary_nid;
-                       int          alive      = lnet_is_route_alive(route);
+                       __u32 net = rnet->lrn_net;
+                       __u32 hops = route->lr_hops;
+                       unsigned int priority = route->lr_priority;
+                       int alive = lnet_is_route_alive(route);
 
                        s += snprintf(s, tmpstr + tmpsiz - s,
                                      "%-8s %4d %8u %7s %s\n",
                                      libcfs_net2str(net), hops,
                                      priority,
                                      alive ? "up" : "down",
-                                     libcfs_nid2str(nid));
+                                     /* TODO: replace with actual nid */
+                                     libcfs_nid2str(LNET_NID_ANY));
                        LASSERT(tmpstr + tmpsiz - s > 0);
                }
 
@@ -291,10 +291,8 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer,
 
        if (*ppos == 0) {
                s += snprintf(s, tmpstr + tmpsiz - s,
-                             "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n",
-                             "ref", "rtr_ref", "alive_cnt", "state",
-                             "last_ping", "ping_sent", "deadline",
-                             "down_ni", "router");
+                             "%-4s %7s %5s %s\n",
+                             "ref", "rtr_ref", "alive", "router");
                LASSERT(tmpstr + tmpsiz - s > 0);
 
                lnet_net_lock(0);
@@ -333,47 +331,15 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer,
 
                if (peer != NULL) {
                        lnet_nid_t nid = peer->lp_primary_nid;
-                       time64_t now = ktime_get_seconds();
-                       /* TODO: readjust what's being printed */
-                       time64_t deadline = 0;
                        int nrefs     = atomic_read(&peer->lp_refcount);
                        int nrtrrefs  = peer->lp_rtr_refcount;
-                       int alive_cnt = 0;
                        int alive     = lnet_is_gateway_alive(peer);
-                       int pingsent  = ((peer->lp_state & LNET_PEER_PING_SENT)
-                                        != 0);
-                       time64_t last_ping = now - peer->lp_rtrcheck_timestamp;
-                       int down_ni   = 0;
-                       struct lnet_route *rtr;
-
-                       if (nrtrrefs > 0) {
-                               list_for_each_entry(rtr, &peer->lp_routes,
-                                                   lr_gwlist) {
-                                       /* downis on any route should be the
-                                        * number of downis on the gateway */
-                                       if (rtr->lr_downis != 0) {
-                                               down_ni = rtr->lr_downis;
-                                               break;
-                                       }
-                               }
-                       }
 
-                       if (deadline == 0)
-                               s += snprintf(s, tmpstr + tmpsiz - s,
-                                             "%-4d %7d %9d %6s %12llu %9d %8s %7d %s\n",
-                                             nrefs, nrtrrefs, alive_cnt,
-                                             alive ? "up" : "down", last_ping,
-                                             pingsent, "NA", down_ni,
-                                             libcfs_nid2str(nid));
-                       else
-                               s += snprintf(s, tmpstr + tmpsiz - s,
-                                             "%-4d %7d %9d %6s %12llu %9d %8llu %7d %s\n",
-                                             nrefs, nrtrrefs, alive_cnt,
-                                             alive ? "up" : "down", last_ping,
-                                             pingsent,
-                                             deadline - now,
-                                             down_ni, libcfs_nid2str(nid));
-                       LASSERT(tmpstr + tmpsiz - s > 0);
+                       s += snprintf(s, tmpstr + tmpsiz - s,
+                                     "%-4d %7d %5s %s\n",
+                                     nrefs, nrtrrefs,
+                                     alive ? "up" : "down",
+                                     libcfs_nid2str(nid));
                }
 
                lnet_net_unlock(0);
@@ -539,19 +505,6 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer,
                                aliveness = lnet_is_peer_ni_alive(peer) ?
                                        "up" : "down";
 
-                       if (lnet_peer_aliveness_enabled(peer)) {
-                               time64_t now = ktime_get_seconds();
-
-                               lastalive = now - peer->lpni_last_alive;
-
-                               /* No need to mess up peers contents with
-                                * arbitrarily long integers - it suffices to
-                                * know that lastalive is more than 10000s old
-                                */
-                               if (lastalive >= 10000)
-                                       lastalive = 9999;
-                       }
-
                        lnet_net_unlock(cpt);
 
                        s += snprintf(s, tmpstr + tmpsiz - s,
index cbdb6bb..df88859 100755 (executable)
@@ -14759,8 +14759,8 @@ test_215() { # for bugs 18102, 21079, 21517
        # where ref > 0, rtr_ref > 0, alive_cnt >= 0, state is up/down,
        # last_ping >= 0, ping_sent is boolean (0/1), deadline and down_ni are
        # numeric (0 or >0 or <0), router is a string like 192.168.1.1@tcp2
-       L1="^ref +rtr_ref +alive_cnt +state +last_ping +ping_sent +deadline +down_ni +router$"
-       BR="^$P +$P +$N +(up|down) +$N +(0|1) +$I +$I +$NID$"
+       L1="^ref +rtr_ref +alive +router$"
+       BR="^$P +$P +(up|down) +$NID$"
        create_lnet_proc_files "routers"
        check_lnet_proc_entry "routers.sys" "lnet.routers" "$BR" "$L1"
        remove_lnet_proc_files "routers"