From: Amir Shehata Date: Mon, 22 Oct 2018 22:09:11 +0000 (-0700) Subject: LU-11299 lnet: Cleanup rcd X-Git-Tag: 2.12.55~25^2~17 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;ds=sidebyside;h=9ee453928ab854f43016624578b95c76e73a6a47;hp=f8c7dd6f53748cf589b2a1f18d93b92761f9d983;p=fs%2Flustre-release.git LU-11299 lnet: Cleanup rcd Cleanup all code pertaining to rcd, as routing code will use discovery going forward and there will be no need to keep its own pinging code. test_215 looks at the routers file which had its format changed. Update the test to reflect the change. Test-Parameters: forbuildonly Signed-off-by: Amir Shehata Change-Id: If31caa3b5703df40b6ae0f758f2fe764991aa4f3 Reviewed-on: https://review.whamcloud.com/33187 Reviewed-by: Olaf Weber Reviewed-by: Sebastien Buisson Tested-by: Jenkins --- diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 48a959c..6f6b146 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -857,11 +857,7 @@ void lnet_monitor_thr_stop(void); bool lnet_router_checker_active(void); void lnet_check_routers(void); -int lnet_router_pre_mt_start(void); void lnet_router_post_mt_start(void); -void lnet_prune_rc_data(int wait_unlink); -void lnet_router_cleanup(void); -void lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net); void lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf); int lnet_ping_info_validate(struct lnet_ping_info *pinfo); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 0ab2d27..92aa993 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -527,16 +527,6 @@ struct lnet_ping_buffer { #define LNET_PING_INFO_TO_BUFFER(PINFO) \ container_of((PINFO), struct lnet_ping_buffer, pb_info) -/* router checker data, per router */ -struct lnet_rc_data { - /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ - struct list_head rcd_list; - struct lnet_handle_md rcd_mdh; /* ping buffer MD */ - struct lnet_peer_ni *rcd_gateway; /* reference to gateway */ - struct lnet_ping_buffer *rcd_pingbuffer;/* ping buffer */ - int rcd_nnis; /* desired size of buffer */ -}; - struct lnet_peer_ni { /* chain on lpn_peer_nis */ struct list_head lpni_peer_nis; @@ -569,20 +559,6 @@ struct lnet_peer_ni { int lpni_minrtrcredits; /* bytes queued for sending */ long lpni_txqnob; - /* notification outstanding? */ - bool lpni_notify; - /* outstanding notification for LND? */ - bool lpni_notifylnd; - /* some thread is handling notification */ - bool lpni_notifying; - /* # times router went dead<->alive. Protected with lpni_lock */ - int lpni_alive_count; - /* time of last aliveness news */ - time64_t lpni_timestamp; - /* when I was last alive */ - time64_t lpni_last_alive; - /* when lpni_ni was queried last time */ - time64_t lpni_last_query; /* network peer is on */ struct lnet_net *lpni_net; /* peer's NID */ @@ -612,8 +588,6 @@ struct lnet_peer_ni { } lpni_pref; /* number of preferred NIDs in lnpi_pref_nids */ __u32 lpni_pref_nnids; - /* router checker state */ - struct lnet_rc_data *lpni_rcd; }; /* Preferred path added due to traffic on non-MR peer_ni */ @@ -821,7 +795,6 @@ struct lnet_route { __u32 lr_net; /* remote network number */ __u32 lr_lnet; /* local network number */ int lr_seq; /* sequence for round-robin */ - unsigned int lr_downis; /* number of down NIs */ __u32 lr_hops; /* how far I am */ unsigned int lr_priority; /* route priority */ }; @@ -1094,12 +1067,6 @@ struct lnet { /* monitor thread startup/shutdown state */ int ln_mt_state; - /* router checker's event queue */ - struct lnet_handle_eq ln_rc_eqh; - /* rcd still pending on net */ - struct list_head ln_rcd_deathrow; - /* rcd ready for free */ - struct list_head ln_rcd_zombie; /* serialise startup/shutdown */ struct semaphore ln_mt_signal; diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 99ccfd7..1c6a931 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1536,6 +1536,28 @@ lnet_get_ni_count(void) return count; } +void +lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) +{ + struct lnet_ni_status *stat; + int nnis; + int i; + + __swab32s(&pbuf->pb_info.pi_magic); + __swab32s(&pbuf->pb_info.pi_features); + __swab32s(&pbuf->pb_info.pi_pid); + __swab32s(&pbuf->pb_info.pi_nnis); + nnis = pbuf->pb_info.pi_nnis; + if (nnis > pbuf->pb_nnis) + nnis = pbuf->pb_nnis; + for (i = 0; i < nnis; i++) { + stat = &pbuf->pb_info.pi_ni[i]; + __swab64s(&stat->ns_nid); + __swab32s(&stat->ns_status); + } + return; +} + int lnet_ping_info_validate(struct lnet_ping_info *pinfo) { @@ -2454,12 +2476,9 @@ int lnet_lib_init(void) } the_lnet.ln_refcount = 0; - LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh); INIT_LIST_HEAD(&the_lnet.ln_lnds); INIT_LIST_HEAD(&the_lnet.ln_net_zombie); - INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); INIT_LIST_HEAD(&the_lnet.ln_msg_resend); - INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); /* The hash table size is the number of bits it takes to express the set * ln_num_routes, minus 1 (better to under estimate than over so we diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 0ef94aa..f3f47ff 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -3407,9 +3407,6 @@ lnet_monitor_thread(void *arg) cfs_time_seconds(interval)); } - /* clean up the router checker */ - lnet_prune_rc_data(1); - /* Shutting down */ lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; @@ -3622,11 +3619,6 @@ int lnet_monitor_thr_start(void) if (rc) goto clean_queues; - /* Pre monitor thread start processing */ - rc = lnet_router_pre_mt_start(); - if (rc) - goto free_mem; - sema_init(&the_lnet.ln_mt_signal, 0); lnet_net_lock(LNET_LOCK_EX); @@ -3651,8 +3643,6 @@ clean_thread: /* block until event callback signals exit */ down(&the_lnet.ln_mt_signal); /* clean up */ - lnet_router_cleanup(); -free_mem: lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; lnet_net_unlock(LNET_LOCK_EX); @@ -3688,7 +3678,6 @@ void lnet_monitor_thr_stop(void) LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN); /* perform cleanup tasks */ - lnet_router_cleanup(); lnet_rsp_tracker_clean(); lnet_clean_local_ni_recoveryq(); lnet_clean_peer_ni_recoveryq(); diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 37bc811..1ceae7b 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -230,99 +230,6 @@ bool lnet_is_route_alive(struct lnet_route *route) return route_alive; } -void -lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, - time64_t when) -{ - if (lp->lpni_timestamp > when) { /* out of date information */ - CDEBUG(D_NET, "Out of date\n"); - return; - } - - /* - * This function can be called with different cpt locks being - * held. lpni_alive_count modification needs to be properly protected. - * Significant reads to lpni_alive_count are also protected with - * the same lock - */ - spin_lock(&lp->lpni_lock); - - lp->lpni_timestamp = when; /* update timestamp */ - - /* got old news */ - if (lp->lpni_alive_count != 0 && - /* new date for old news */ - (!lnet_is_peer_ni_alive(lp)) == (!alive)) { - spin_unlock(&lp->lpni_lock); - CDEBUG(D_NET, "Old news\n"); - return; - } - - /* Flag that notification is outstanding */ - - lp->lpni_alive_count++; - lp->lpni_notify = 1; - lp->lpni_notifylnd = notifylnd; - if (lnet_is_peer_ni_alive(lp)) - lp->lpni_ping_feats = LNET_PING_FEAT_INVAL; /* reset */ - - spin_unlock(&lp->lpni_lock); - - CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lpni_nid), alive); -} - -/* - * This function will always be called with lp->lpni_cpt lock held. - */ -static void -lnet_ni_notify_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) -{ - int alive; - int notifylnd; - - /* Notify only in 1 thread at any time to ensure ordered notification. - * NB individual events can be missed; the only guarantee is that you - * always get the most recent news */ - - spin_lock(&lp->lpni_lock); - - if (lp->lpni_notifying || ni == NULL) { - spin_unlock(&lp->lpni_lock); - return; - } - - lp->lpni_notifying = 1; - - /* - * lp->lpni_notify needs to be protected because it can be set in - * lnet_notify_locked(). - */ - while (lp->lpni_notify) { - alive = lnet_is_peer_ni_alive(lp); - notifylnd = lp->lpni_notifylnd; - - lp->lpni_notifylnd = 0; - lp->lpni_notify = 0; - - if (notifylnd && ni->ni_net->net_lnd->lnd_notify != NULL) { - spin_unlock(&lp->lpni_lock); - lnet_net_unlock(lp->lpni_cpt); - - /* A new notification could happen now; I'll handle it - * when control returns to me */ - - (ni->ni_net->net_lnd->lnd_notify)(ni, lp->lpni_nid, - alive); - - lnet_net_lock(lp->lpni_cpt); - spin_lock(&lp->lpni_lock); - } - } - - lp->lpni_notifying = 0; - spin_unlock(&lp->lpni_lock); -} - static void lnet_rtr_addref_locked(struct lnet_peer *lp) { @@ -744,89 +651,6 @@ lnet_get_route(int idx, __u32 *net, __u32 *hops, return -ENOENT; } -void -lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) -{ - struct lnet_ni_status *stat; - int nnis; - int i; - - __swab32s(&pbuf->pb_info.pi_magic); - __swab32s(&pbuf->pb_info.pi_features); - __swab32s(&pbuf->pb_info.pi_pid); - __swab32s(&pbuf->pb_info.pi_nnis); - nnis = pbuf->pb_info.pi_nnis; - if (nnis > pbuf->pb_nnis) - nnis = pbuf->pb_nnis; - for (i = 0; i < nnis; i++) { - stat = &pbuf->pb_info.pi_ni[i]; - __swab64s(&stat->ns_nid); - __swab32s(&stat->ns_status); - } - return; -} - -/** - * TODO: re-implement - */ -static void -lnet_parse_rc_info(struct lnet_rc_data *rcd) -{ - rcd = rcd; -} - -static void -lnet_router_checker_event(struct lnet_event *event) -{ - struct lnet_rc_data *rcd = event->md.user_ptr; - struct lnet_peer_ni *lp; - - LASSERT(rcd != NULL); - - if (event->unlinked) { - LNetInvalidateMDHandle(&rcd->rcd_mdh); - return; - } - - LASSERT(event->type == LNET_EVENT_SEND || - event->type == LNET_EVENT_REPLY); - - lp = rcd->rcd_gateway; - LASSERT(lp != NULL); - - /* NB: it's called with holding lnet_res_lock, we have a few - * places need to hold both locks at the same time, please take - * care of lock ordering */ - lnet_net_lock(lp->lpni_cpt); - if (!lnet_isrouter(lp) || lp->lpni_rcd != rcd) { - /* ignore if no longer a router or rcd is replaced */ - goto out; - } - - if (event->type == LNET_EVENT_SEND) { - if (event->status == 0) - goto out; - } - - /* LNET_EVENT_REPLY */ - /* A successful REPLY means the router is up. If _any_ comms - * to the router fail I assume it's down (this will happen if - * we ping alive routers to try to detect router death before - * apps get burned). */ - - lnet_notify_locked(lp, 1, !event->status, ktime_get_seconds()); - /* The router checker will wake up very shortly and do the - * actual notification. - * XXX If 'lp' stops being a router before then, it will still - * have the notification pending!!! */ - - if (avoid_asym_router_failure && event->status == 0) - lnet_parse_rc_info(rcd); - - out: - lnet_net_unlock(lp->lpni_cpt); -} - static void lnet_wait_known_routerstate(void) { @@ -864,26 +688,6 @@ lnet_wait_known_routerstate(void) } } -/* TODO: reimplement */ -void -lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net) -{ - struct lnet_route *rte; - struct lnet_peer *lp; - - if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) - lp = gw->lpni_peer_net->lpn_peer; - else - return; - - list_for_each_entry(rte, &lp->lp_routes, lr_gwlist) { - if (rte->lr_net == net) { - rte->lr_downis = 0; - break; - } - } -} - static void lnet_update_ni_status_locked(void) { @@ -924,27 +728,6 @@ lnet_update_ni_status_locked(void) } } -int lnet_router_pre_mt_start(void) -{ - int rc; - - if (check_routers_before_use && - dead_router_check_interval <= 0) { - LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be" - " set if 'check_routers_before_use' is set" - "\n"); - return -EINVAL; - } - - rc = LNetEQAlloc(0, lnet_router_checker_event, &the_lnet.ln_rc_eqh); - if (rc != 0) { - CERROR("Can't allocate EQ(0): %d\n", rc); - return -ENOMEM; - } - - return 0; -} - void lnet_router_post_mt_start(void) { if (check_routers_before_use) { @@ -955,22 +738,6 @@ void lnet_router_post_mt_start(void) } } -void -lnet_router_cleanup(void) -{ - int rc; - - rc = LNetEQFree(the_lnet.ln_rc_eqh); - LASSERT(rc == 0); - return; -} - -void -lnet_prune_rc_data(int wait_unlink) -{ - wait_unlink = wait_unlink; -} - /* * This function is called from the monitor thread to check if there are * any active routers that need to be checked. @@ -986,11 +753,6 @@ lnet_router_checker_active(void) if (the_lnet.ln_routing) return true; - /* if there are routers that need to be cleaned up then do so */ - if (!list_empty(&the_lnet.ln_rcd_deathrow) || - !list_empty(&the_lnet.ln_rcd_zombie)) - return true; - return !list_empty(&the_lnet.ln_routers) && (live_router_check_interval > 0 || dead_router_check_interval > 0); @@ -1025,8 +787,6 @@ rescan: lnet_update_ni_status_locked(); lnet_net_unlock(cpt); - - lnet_prune_rc_data(0); /* don't wait for UNLINK */ } void @@ -1519,18 +1279,6 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, time64_t when) lnet_net_lock(cpt); } - /* We can't fully trust LND on reporting exact peer last_alive - * if he notifies us about dead peer. For example ksocklnd can - * call us with when == _time_when_the_node_was_booted_ if - * no connections were successfully established */ - if (ni != NULL && !alive && when < lp->lpni_last_alive) - when = lp->lpni_last_alive; - - lnet_notify_locked(lp, ni == NULL, alive, when); - - if (ni != NULL) - lnet_ni_notify_locked(ni, lp); - lnet_peer_ni_decref_locked(lp); lnet_net_unlock(cpt); diff --git a/lnet/lnet/router_proc.c b/lnet/lnet/router_proc.c index aaa12cc..5627e40 100644 --- a/lnet/lnet/router_proc.c +++ b/lnet/lnet/router_proc.c @@ -224,18 +224,18 @@ proc_lnet_routes(struct ctl_table *table, int write, void __user *buffer, } if (route != NULL) { - __u32 net = rnet->lrn_net; - __u32 hops = route->lr_hops; - unsigned int priority = route->lr_priority; - lnet_nid_t nid = route->lr_gateway->lp_primary_nid; - int alive = lnet_is_route_alive(route); + __u32 net = rnet->lrn_net; + __u32 hops = route->lr_hops; + unsigned int priority = route->lr_priority; + int alive = lnet_is_route_alive(route); s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4d %8u %7s %s\n", libcfs_net2str(net), hops, priority, alive ? "up" : "down", - libcfs_nid2str(nid)); + /* TODO: replace with actual nid */ + libcfs_nid2str(LNET_NID_ANY)); LASSERT(tmpstr + tmpsiz - s > 0); } @@ -291,10 +291,8 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (*ppos == 0) { s += snprintf(s, tmpstr + tmpsiz - s, - "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n", - "ref", "rtr_ref", "alive_cnt", "state", - "last_ping", "ping_sent", "deadline", - "down_ni", "router"); + "%-4s %7s %5s %s\n", + "ref", "rtr_ref", "alive", "router"); LASSERT(tmpstr + tmpsiz - s > 0); lnet_net_lock(0); @@ -333,47 +331,15 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer, if (peer != NULL) { lnet_nid_t nid = peer->lp_primary_nid; - time64_t now = ktime_get_seconds(); - /* TODO: readjust what's being printed */ - time64_t deadline = 0; int nrefs = atomic_read(&peer->lp_refcount); int nrtrrefs = peer->lp_rtr_refcount; - int alive_cnt = 0; int alive = lnet_is_gateway_alive(peer); - int pingsent = ((peer->lp_state & LNET_PEER_PING_SENT) - != 0); - time64_t last_ping = now - peer->lp_rtrcheck_timestamp; - int down_ni = 0; - struct lnet_route *rtr; - - if (nrtrrefs > 0) { - list_for_each_entry(rtr, &peer->lp_routes, - lr_gwlist) { - /* downis on any route should be the - * number of downis on the gateway */ - if (rtr->lr_downis != 0) { - down_ni = rtr->lr_downis; - break; - } - } - } - if (deadline == 0) - s += snprintf(s, tmpstr + tmpsiz - s, - "%-4d %7d %9d %6s %12llu %9d %8s %7d %s\n", - nrefs, nrtrrefs, alive_cnt, - alive ? "up" : "down", last_ping, - pingsent, "NA", down_ni, - libcfs_nid2str(nid)); - else - s += snprintf(s, tmpstr + tmpsiz - s, - "%-4d %7d %9d %6s %12llu %9d %8llu %7d %s\n", - nrefs, nrtrrefs, alive_cnt, - alive ? "up" : "down", last_ping, - pingsent, - deadline - now, - down_ni, libcfs_nid2str(nid)); - LASSERT(tmpstr + tmpsiz - s > 0); + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %5s %s\n", + nrefs, nrtrrefs, + alive ? "up" : "down", + libcfs_nid2str(nid)); } lnet_net_unlock(0); @@ -539,19 +505,6 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, aliveness = lnet_is_peer_ni_alive(peer) ? "up" : "down"; - if (lnet_peer_aliveness_enabled(peer)) { - time64_t now = ktime_get_seconds(); - - lastalive = now - peer->lpni_last_alive; - - /* No need to mess up peers contents with - * arbitrarily long integers - it suffices to - * know that lastalive is more than 10000s old - */ - if (lastalive >= 10000) - lastalive = 9999; - } - lnet_net_unlock(cpt); s += snprintf(s, tmpstr + tmpsiz - s, diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index cbdb6bb..df88859 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -14759,8 +14759,8 @@ test_215() { # for bugs 18102, 21079, 21517 # where ref > 0, rtr_ref > 0, alive_cnt >= 0, state is up/down, # last_ping >= 0, ping_sent is boolean (0/1), deadline and down_ni are # numeric (0 or >0 or <0), router is a string like 192.168.1.1@tcp2 - L1="^ref +rtr_ref +alive_cnt +state +last_ping +ping_sent +deadline +down_ni +router$" - BR="^$P +$P +$N +(up|down) +$N +(0|1) +$I +$I +$NID$" + L1="^ref +rtr_ref +alive +router$" + BR="^$P +$P +(up|down) +$NID$" create_lnet_proc_files "routers" check_lnet_proc_entry "routers.sys" "lnet.routers" "$BR" "$L1" remove_lnet_proc_files "routers"