From: isaac Date: Tue, 1 Dec 2009 15:00:55 +0000 (+0000) Subject: i=maxim,b=18460,b=20171: X-Git-Tag: GIT_EPOCH_B_HD_KDMU~43 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=5e1e6a6756d3b4ca19a0d7e0defcf974dbfed13c i=maxim,b=18460,b=20171: - avoid asymmetrical router failures; monotonic timing source for router checker. --- diff --git a/lnet/ChangeLog b/lnet/ChangeLog index 84f18b2..8f7a42d 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -17,6 +17,15 @@ Bugzilla : Description: Details : +Severity : normal +Bugzilla : 20171 +Description: router checker stops working when system wall clock goes backward +Details : use monotonic timing source instead of system wall clock time. + +Severity : enhancement +Bugzilla : 18460 +Description: avoid asymmetrical router failures + Severity : enhancement Bugzilla : 19735 Description: multiple-instance support for kptllnd diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 87a29c7..5332664 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -537,8 +537,8 @@ lnet_net2ni (__u32 net) return ni; } -int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, time_t when); -void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, time_t when); +int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, cfs_time_t when); +void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when); int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid); int lnet_check_routes(void); int lnet_del_route(__u32 net, lnet_nid_t gw_nid); @@ -673,8 +673,14 @@ void lnet_acceptor_stop(void); void lnet_get_tunables(void); int lnet_peers_start_down(void); int lnet_peer_buffer_credits(lnet_ni_t *ni); + +extern int router_ping_timeout; +extern int dead_router_check_interval; +extern int live_router_check_interval; int lnet_router_checker_start(void); void lnet_router_checker_stop(void); +void 
lnet_swap_pinginfo(lnet_ping_info_t *info); +int lnet_router_down_ni(lnet_peer_t *rtr, __u32 net); int lnet_ping_target_init(void); void lnet_ping_target_fini(void); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 8e5f1a0..9efa5da 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -358,7 +358,7 @@ typedef struct lnet_lnd void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); /* query of peer aliveness */ - void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, time_t *when); + void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when); #if defined(__KERNEL__) || defined(HAVE_LIBPTHREAD) /* accept a new connection */ @@ -374,6 +374,15 @@ typedef struct lnet_lnd #endif } lnd_t; +#define LNET_NI_STATUS_UP 0x15aac0de +#define LNET_NI_STATUS_DOWN 0xdeadface +#define LNET_NI_STATUS_INVALID 0x00000000 +typedef struct { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} WIRE_ATTR lnet_ni_status_t; + #define LNET_MAX_INTERFACES 16 typedef struct lnet_ni { @@ -389,9 +398,31 @@ typedef struct lnet_ni { void *ni_data; /* instance-specific data */ lnd_t *ni_lnd; /* procedural interface */ int ni_refcount; /* reference count */ + cfs_time_t ni_last_alive; /* when I was last alive */ + lnet_ni_status_t *ni_status; /* my health status */ char *ni_interfaces[LNET_MAX_INTERFACES]; /* equivalent interfaces to use */ } lnet_ni_t; +#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL +#define LNET_PROTO_PING_VERSION 2 +#define LNET_PROTO_PING_VERSION1 1 +typedef struct { + __u32 pi_magic; + __u32 pi_version; + lnet_pid_t pi_pid; + __u32 pi_nnis; + lnet_ni_status_t pi_ni[0]; +} WIRE_ATTR lnet_ping_info_t; + +/* router checker data, per router */ +#define LNET_MAX_RTR_NIS 16 +#define LNET_PINGINFO_SIZE offsetof(lnet_ping_info_t, pi_ni[LNET_MAX_RTR_NIS]) +typedef struct { + struct list_head rcd_list; /* chain on the_lnet.ln_zombie_rcd */ + lnet_handle_md_t rcd_mdh; /* ping buffer MD */ 
+ lnet_ping_info_t *rcd_pinginfo; /* ping buffer */ +} lnet_rc_data_t; + typedef struct lnet_peer { struct list_head lp_hashlist; /* chain on peer hash */ struct list_head lp_txq; /* messages blocking for tx credits */ @@ -408,15 +439,16 @@ typedef struct lnet_peer { unsigned int lp_ping_notsent; /* SEND event outstanding from ping */ int lp_alive_count; /* # times router went dead<->alive */ long lp_txqnob; /* bytes queued for sending */ - time_t lp_timestamp; /* time of last aliveness news */ - time_t lp_last_alive; /* when I was last alive */ - time_t lp_last_query; /* when LND was queried last time */ - time_t lp_ping_timestamp; /* time of last ping attempt */ - time_t lp_ping_deadline; /* != 0 if ping reply expected */ + cfs_time_t lp_timestamp; /* time of last aliveness news */ + cfs_time_t lp_ping_timestamp; /* time of last ping attempt */ + cfs_time_t lp_ping_deadline; /* != 0 if ping reply expected */ + cfs_time_t lp_last_alive; /* when I was last alive */ + cfs_time_t lp_last_query; /* when lp_ni was queried last time */ lnet_ni_t *lp_ni; /* interface peer is on */ lnet_nid_t lp_nid; /* peer's NID */ int lp_refcount; /* # refs */ int lp_rtr_refcount; /* # refs from lnet_route_t::lr_gateway */ + lnet_rc_data_t *lp_rcd; /* router checker state */ } lnet_peer_t; typedef struct { @@ -466,16 +498,6 @@ typedef struct { #define LNET_NRBPOOLS 3 /* # different router buffer pools */ -#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL -#define LNET_PROTO_PING_VERSION 1 -typedef struct { - __u32 pi_magic; - __u32 pi_version; - lnet_pid_t pi_pid; - __u32 pi_nnids; - lnet_nid_t pi_nid[0]; -} WIRE_ATTR lnet_ping_info_t; - /* Options for lnet_portal_t::ptl_options */ #define LNET_PTL_LAZY (1 << 0) typedef struct { @@ -576,6 +598,7 @@ typedef struct int ln_rc_state; /* router checker startup/shutdown state */ lnet_handle_eq_t ln_rc_eqh; /* router checker's event queue */ lnet_handle_md_t ln_rc_mdh; + struct list_head ln_zombie_rcd; #ifdef LNET_USE_LIB_FREELIST 
lnet_freelist_t ln_free_mes; diff --git a/lnet/klnds/mxlnd/mxlnd_cb.c b/lnet/klnds/mxlnd/mxlnd_cb.c index 14b49ed..6dee555 100644 --- a/lnet/klnds/mxlnd/mxlnd_cb.c +++ b/lnet/klnds/mxlnd/mxlnd_cb.c @@ -484,8 +484,7 @@ mxlnd_conn_disconnect(struct kmx_conn *conn, int mx_dis, int send_bye) } if (kmxlnd_data.kmx_shutdown != 1) { - time_t last_alive = 0; - unsigned long last_msg = 0; + unsigned long last_msg = 0; /* notify LNET that we are giving up on this peer */ if (time_after(conn->mxk_last_rx, conn->mxk_last_tx)) @@ -493,9 +492,7 @@ mxlnd_conn_disconnect(struct kmx_conn *conn, int mx_dis, int send_bye) else last_msg = conn->mxk_last_tx; - last_alive = cfs_time_current_sec() - - cfs_duration_sec(cfs_time_current() - last_msg); - lnet_notify(kmxlnd_data.kmx_ni, conn->mxk_peer->mxp_nid, 0, last_alive); + lnet_notify(kmxlnd_data.kmx_ni, conn->mxk_peer->mxp_nid, 0, last_msg); if (mx_dis && valid) mx_disconnect(kmxlnd_data.kmx_endpt, epa); diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 419448d..d10041a 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -1075,7 +1075,7 @@ kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) } void -kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when) +kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) { cfs_time_t last_alive = 0; rwlock_t *glock = &kiblnd_data.kib_global_lock; @@ -1095,8 +1095,7 @@ kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when) read_unlock_irqrestore(glock, flags); if (last_alive != 0) - *when = cfs_time_current_sec() - - cfs_duration_sec(cfs_time_current() - last_alive); + *when = last_alive; /* peer is not persistent in hash, trigger peer creation * and connection establishment with a NULL tx */ diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 7d81190..8ccb4df 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -942,7 +942,7 @@ void kiblnd_pmr_pool_unmap(kib_phys_mr_t 
*pmr); int kiblnd_startup (lnet_ni_t *ni); void kiblnd_shutdown (lnet_ni_t *ni); int kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg); -void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, time_t *when); +void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when); int kiblnd_tunables_init(void); void kiblnd_tunables_fini(void); diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index ec34090..01c4621 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -1724,8 +1724,8 @@ kiblnd_peer_alive (kib_peer_t *peer) void kiblnd_peer_notify (kib_peer_t *peer) { - time_t last_alive = 0; int error = 0; + cfs_time_t last_alive = 0; unsigned long flags; read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); @@ -1737,9 +1737,7 @@ kiblnd_peer_notify (kib_peer_t *peer) error = peer->ibp_error; peer->ibp_error = 0; - last_alive = cfs_time_current_sec() - - cfs_duration_sec(cfs_time_current() - - peer->ibp_last_alive); + last_alive = peer->ibp_last_alive; } read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); diff --git a/lnet/klnds/ptllnd/ptllnd.c b/lnet/klnds/ptllnd/ptllnd.c index 6562561..0abaa37 100755 --- a/lnet/klnds/ptllnd/ptllnd.c +++ b/lnet/klnds/ptllnd/ptllnd.c @@ -482,7 +482,7 @@ kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) } void -kptllnd_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when) +kptllnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) { kptl_net_t *net = ni->ni_data; kptl_peer_t *peer = NULL; @@ -495,9 +495,7 @@ kptllnd_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when) spin_lock_irqsave(&peer->peer_lock, flags); if (peer->peer_last_alive != 0) - *when = cfs_time_current_sec() - - cfs_duration_sec(cfs_time_current() - - peer->peer_last_alive); + *when = peer->peer_last_alive; spin_unlock_irqrestore(&peer->peer_lock, flags); kptllnd_peer_decref(peer); return; diff --git a/lnet/klnds/ptllnd/ptllnd.h b/lnet/klnds/ptllnd/ptllnd.h index e747812..49b90d3 
100755 --- a/lnet/klnds/ptllnd/ptllnd.h +++ b/lnet/klnds/ptllnd/ptllnd.h @@ -362,7 +362,7 @@ kptllnd_schedule_ptltrace_dump (void) int kptllnd_startup(lnet_ni_t *ni); void kptllnd_shutdown(lnet_ni_t *ni); int kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); -void kptllnd_query (struct lnet_ni *ni, lnet_nid_t nid, time_t *when); +void kptllnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when); int kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); int kptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, unsigned int niov, diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c index 5d659d8..62ff41f 100644 --- a/lnet/klnds/ptllnd/ptllnd_peer.c +++ b/lnet/klnds/ptllnd/ptllnd_peer.c @@ -289,17 +289,14 @@ kptllnd_peer_notify (kptl_peer_t *peer) int i = 0; int nnets = 0; int error = 0; - time_t last_alive = 0; + cfs_time_t last_alive = 0; spin_lock_irqsave(&peer->peer_lock, flags); if (peer->peer_error != 0) { error = peer->peer_error; peer->peer_error = 0; - - last_alive = cfs_time_current_sec() - - cfs_duration_sec(cfs_time_current() - - peer->peer_last_alive); + last_alive = peer->peer_last_alive; } spin_unlock_irqrestore(&peer->peer_lock, flags); diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index a616be1..15bb8cf 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1495,8 +1495,8 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) void ksocknal_peer_failed (ksock_peer_t *peer) { - time_t last_alive = 0; - int notify = 0; + int notify = 0; + cfs_time_t last_alive = 0; /* There has been a connection failure or comms error; but I'll only * tell LNET I think the peer is dead if it's to another kernel and @@ -1509,9 +1509,7 @@ ksocknal_peer_failed (ksock_peer_t *peer) peer->ksnp_accepting == 0 && ksocknal_find_connecting_route_locked(peer) == NULL) { notify = 1; - last_alive = (time_t) (cfs_time_current_sec() - - 
cfs_duration_sec(cfs_time_current() - - peer->ksnp_last_alive)); + last_alive = peer->ksnp_last_alive; } cfs_read_unlock (&ksocknal_data.ksnd_global_lock); @@ -1792,7 +1790,7 @@ ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive) } void -ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when) +ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) { int connect = 1; cfs_time_t last_alive = 0; @@ -1829,8 +1827,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, time_t *when) read_unlock(glock); if (last_alive != 0) - *when = cfs_time_current_sec() - - cfs_duration_sec(cfs_time_current() - last_alive); + *when = last_alive; if (!connect) return; diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index e4386ab..a4cec42 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -548,7 +548,7 @@ extern void ksocknal_next_tx_carrier(ksock_conn_t *conn); extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error); extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive); -extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, time_t *when); +extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when); extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg); extern void ksocknal_thread_fini (void); extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer); diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 86541a7..75cbbcc 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1120,9 +1120,11 @@ LNetInit(void) memset(&the_lnet, 0, sizeof(the_lnet)); lnet_init_locks(); - CFS_INIT_LIST_HEAD(&the_lnet.ln_lnds); the_lnet.ln_refcount = 0; the_lnet.ln_init = 1; + LNetInvalidateHandle(&the_lnet.ln_rc_eqh); + CFS_INIT_LIST_HEAD(&the_lnet.ln_lnds); + CFS_INIT_LIST_HEAD(&the_lnet.ln_zombie_rcd); #ifdef __KERNEL__ /* All LNDs apart 
from the LOLND are in separate modules. They @@ -1207,11 +1209,13 @@ LNetNIInit(lnet_pid_t requested_pid) the_lnet.ln_refcount = 1; /* Now I may use my own API functions... */ - rc = lnet_router_checker_start(); + /* NB router checker needs the_lnet.ln_ping_info in + * lnet_router_checker -> lnet_update_ni_status */ + rc = lnet_ping_target_init(); if (rc != 0) goto failed3; - rc = lnet_ping_target_init(); + rc = lnet_router_checker_start(); if (rc != 0) goto failed4; @@ -1219,7 +1223,7 @@ LNetNIInit(lnet_pid_t requested_pid) goto out; failed4: - lnet_router_checker_stop(); + lnet_ping_target_fini(); failed3: the_lnet.ln_refcount = 0; lnet_acceptor_stop(); @@ -1249,8 +1253,8 @@ LNetNIFini() LASSERT (!the_lnet.ln_niinit_self); lnet_proc_fini(); - lnet_ping_target_fini(); lnet_router_checker_stop(); + lnet_ping_target_fini(); /* Teardown fns that use my own API functions BEFORE here */ the_lnet.ln_refcount = 0; @@ -1299,7 +1303,9 @@ LNetCtl(unsigned int cmd, void *arg) &data->ioc_nid, &data->ioc_flags); case IOC_LIBCFS_NOTIFY_ROUTER: return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, - (time_t)data->ioc_u64[0]); + cfs_time_current() - + cfs_time_seconds(cfs_time_current_sec() - + (time_t)data->ioc_u64[0])); case IOC_LIBCFS_PORTALS_COMPATIBILITY: /* This can be removed once lustre stops calling it */ @@ -1404,18 +1410,16 @@ LNetSnprintHandle(char *str, int len, lnet_handle_any_t h) snprintf(str, len, LPX64, h.cookie); } - -int -lnet_ping_target_init(void) +static int +lnet_create_ping_info(void) { - lnet_handle_me_t meh; - lnet_process_id_t id; - lnet_md_t md = {0}; - int rc; - int rc2; + int i; int n; + int rc; unsigned int infosz; - int i; + lnet_ni_t *ni; + lnet_process_id_t id; + lnet_ping_info_t *pinfo; for (n = 0; ; n++) { rc = LNetGetId(n, &id); @@ -1425,24 +1429,76 @@ lnet_ping_target_init(void) LASSERT (rc == 0); } - infosz = offsetof(lnet_ping_info_t, pi_nid[n]); - LIBCFS_ALLOC(the_lnet.ln_ping_info, infosz); - if (the_lnet.ln_ping_info == NULL) { + 
infosz = offsetof(lnet_ping_info_t, pi_ni[n]); + LIBCFS_ALLOC(pinfo, infosz); + if (pinfo == NULL) { CERROR("Can't allocate ping info[%d]\n", n); return -ENOMEM; } - the_lnet.ln_ping_info->pi_magic = LNET_PROTO_PING_MAGIC; - the_lnet.ln_ping_info->pi_version = LNET_PROTO_PING_VERSION; - the_lnet.ln_ping_info->pi_pid = the_lnet.ln_pid; - the_lnet.ln_ping_info->pi_nnids = n; + pinfo->pi_nnis = n; + pinfo->pi_pid = the_lnet.ln_pid; + pinfo->pi_magic = LNET_PROTO_PING_MAGIC; + pinfo->pi_version = LNET_PROTO_PING_VERSION; for (i = 0; i < n; i++) { + lnet_ni_status_t *ns = &pinfo->pi_ni[i]; + rc = LNetGetId(i, &id); LASSERT (rc == 0); - the_lnet.ln_ping_info->pi_nid[i] = id.nid; + + ns->ns_nid = id.nid; + ns->ns_status = LNET_NI_STATUS_UP; + + LNET_LOCK(); + + ni = lnet_nid2ni_locked(id.nid); + LASSERT (ni != NULL); + LASSERT (ni->ni_status == NULL); + ni->ni_status = ns; + lnet_ni_decref_locked(ni); + + LNET_UNLOCK(); } + the_lnet.ln_ping_info = pinfo; + return 0; +} + +static void +lnet_destroy_ping_info(void) +{ + lnet_ni_t *ni; + + LNET_LOCK(); + + list_for_each_entry (ni, &the_lnet.ln_nis, ni_list) { + ni->ni_status = NULL; + } + + LNET_UNLOCK(); + + LIBCFS_FREE(the_lnet.ln_ping_info, + offsetof(lnet_ping_info_t, + pi_ni[the_lnet.ln_ping_info->pi_nnis])); + the_lnet.ln_ping_info = NULL; + return; +} + +int +lnet_ping_target_init(void) +{ + lnet_md_t md = {0}; + lnet_handle_me_t meh; + lnet_process_id_t id; + int rc; + int rc2; + int infosz; + + rc = lnet_create_ping_info(); + if (rc != 0) + return rc; + /* We can have a tiny EQ since we only need to see the unlink event on * teardown, which by definition is the last one! 
*/ rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq); @@ -1465,6 +1521,8 @@ lnet_ping_target_init(void) } /* initialize md content */ + infosz = offsetof(lnet_ping_info_t, + pi_ni[the_lnet.ln_ping_info->pi_nnis]); md.start = the_lnet.ln_ping_info; md.length = infosz; md.threshold = LNET_MD_THRESH_INF; @@ -1491,8 +1549,7 @@ lnet_ping_target_init(void) rc2 = LNetEQFree(the_lnet.ln_ping_target_eq); LASSERT (rc2 == 0); failed_0: - LIBCFS_FREE(the_lnet.ln_ping_info, infosz); - + lnet_destroy_ping_info(); return rc; } @@ -1529,11 +1586,7 @@ lnet_ping_target_fini(void) rc = LNetEQFree(the_lnet.ln_ping_target_eq); LASSERT (rc == 0); - - LIBCFS_FREE(the_lnet.ln_ping_info, - offsetof(lnet_ping_info_t, - pi_nid[the_lnet.ln_ping_info->pi_nnids])); - + lnet_destroy_ping_info(); cfs_restore_sigs(blocked); } @@ -1548,7 +1601,7 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i int unlinked = 0; int replied = 0; const int a_long_time = 60000; /* mS */ - int infosz = offsetof(lnet_ping_info_t, pi_nid[n_ids]); + int infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]); lnet_ping_info_t *info; lnet_process_id_t tmpid; int i; @@ -1642,7 +1695,6 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i CWARN("ping %s: late network completion\n", libcfs_id2str(id)); } - } else if (event.type == LNET_EVENT_REPLY) { replied = 1; rc = event.mlength; @@ -1671,14 +1723,7 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i } if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { - /* NB I might be swabbing garbage until I check below, but it - * doesn't matter */ - __swab32s(&info->pi_version); - __swab32s(&info->pi_pid); - __swab32s(&info->pi_nnids); - for (i = 0; i < (int)info->pi_nnids && i < (int)n_ids; i++) - __swab64s(&info->pi_nid[i]); - + lnet_swap_pinginfo(info); } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { CERROR("%s: Unexpected magic %08x\n", libcfs_id2str(id), 
info->pi_magic); @@ -1691,18 +1736,18 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i goto out_1; } - if (nob < (int)offsetof(lnet_ping_info_t, pi_nid[0])) { + if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) { CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), - nob, (int)offsetof(lnet_ping_info_t, pi_nid[0])); + nob, (int)offsetof(lnet_ping_info_t, pi_ni[0])); goto out_1; } - if ((int) info->pi_nnids < n_ids) - n_ids = info->pi_nnids; + if (info->pi_nnis < n_ids) + n_ids = info->pi_nnis; - if (nob < (int)offsetof(lnet_ping_info_t, pi_nid[n_ids])) { + if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) { CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), - nob, (int)offsetof(lnet_ping_info_t, pi_nid[n_ids])); + nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids])); goto out_1; } @@ -1710,7 +1755,7 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i for (i = 0; i < n_ids; i++) { tmpid.pid = info->pi_pid; - tmpid.nid = info->pi_nid[i]; + tmpid.nid = info->pi_ni[i].ns_nid; #ifdef __KERNEL__ if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) goto out_1; @@ -1718,7 +1763,7 @@ lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_i ids[i] = tmpid; #endif } - rc = info->pi_nnids; + rc = info->pi_nnis; out_1: rc2 = LNetEQFree(eqh); diff --git a/lnet/lnet/config.c b/lnet/lnet/config.c index 45bd89f..f8d6dea 100644 --- a/lnet/lnet/config.c +++ b/lnet/lnet/config.c @@ -117,7 +117,7 @@ lnet_net_unique(__u32 net, struct list_head *nilist) if (LNET_NIDNET(ni->ni_nid) == net) return 0; } - + return 1; } @@ -131,20 +131,21 @@ lnet_new_ni(__u32 net, struct list_head *nilist) libcfs_net2str(net)); return NULL; } - + LIBCFS_ALLOC(ni, sizeof(*ni)); if (ni == NULL) { CERROR("Out of memory creating network %s\n", libcfs_net2str(net)); return NULL; } - + /* zero counters/flags, NULL pointers... 
*/ memset(ni, 0, sizeof(*ni)); /* LND will fill in the address part of the NID */ ni->ni_nid = LNET_MKNID(net, 0); CFS_INIT_LIST_HEAD(&ni->ni_txq); + ni->ni_last_alive = cfs_time_current(); list_add_tail(&ni->ni_list, nilist); return ni; diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c index 6c988f2..6e64ad2 100644 --- a/lnet/lnet/lib-md.c +++ b/lnet/lnet/lib-md.c @@ -213,6 +213,24 @@ lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd) } int +lnet_md_validate(lnet_md_t *umd) +{ + if (umd->start == NULL) { + CERROR("MD start pointer can not be NULL\n"); + return -EINVAL; + } + + if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && + umd->length > LNET_MAX_IOV) { + CERROR("Invalid option: too many fragments %d, %d max\n", + umd->length, LNET_MAX_IOV); + return -EINVAL; + } + + return 0; +} + +int LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle) { @@ -223,12 +241,13 @@ LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - if ((umd.options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && - umd.length > LNET_MAX_IOV) /* too many fragments */ + if (lnet_md_validate(&umd) != 0) return -EINVAL; - if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) { + CERROR("Invalid option: no MD_OP set\n"); return -EINVAL; + } md = lnet_md_alloc(&umd); if (md == NULL) @@ -274,12 +293,13 @@ LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle) LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - if ((umd.options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && - umd.length > LNET_MAX_IOV) /* too many fragments */ + if (lnet_md_validate(&umd) != 0) return -EINVAL; - if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) { + CERROR("Invalid option: GET|PUT illegal on active MDs\n"); return -EINVAL; + } md = 
lnet_md_alloc(&umd); if (md == NULL) diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index f31b0fe..743146c 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -903,7 +903,7 @@ lnet_eager_recv_locked(lnet_msg_t *msg) void lnet_ni_peer_alive(lnet_peer_t *lp) { - time_t last_alive = 0; + cfs_time_t last_alive = 0; lnet_ni_t *ni = lp->lp_ni; LASSERT (ni != NULL); @@ -914,7 +914,7 @@ lnet_ni_peer_alive(lnet_peer_t *lp) (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive); LNET_LOCK(); - lp->lp_last_query = cfs_time_current_sec(); + lp->lp_last_query = cfs_time_current(); if (last_alive != 0) /* NI has updated timestamp */ lp->lp_last_alive = last_alive; @@ -923,29 +923,34 @@ lnet_ni_peer_alive(lnet_peer_t *lp) /* NB: always called with LNET_LOCK held */ static inline int -lnet_peer_is_alive (lnet_peer_t *lp, time_t now) +lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now) { lnet_ni_t *ni = lp->lp_ni; - time_t deadline; + cfs_time_t deadline; int alive; LASSERT (ni != NULL); LASSERT (ni->ni_peertimeout > 0); + /* Trust lnet_notify() if it has more recent aliveness news, but + * ignore the initial assumed death (see lnet_peers_start_down()). 
+ */ if (!lp->lp_alive && lp->lp_alive_count > 0 && cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive)) - return 0; + return 0; - deadline = cfs_time_add(lp->lp_last_alive, ni->ni_peertimeout); + deadline = cfs_time_add(lp->lp_last_alive, + cfs_time_seconds(ni->ni_peertimeout)); alive = cfs_time_after(deadline, now); - if (alive && !lp->lp_alive) /* update obsolete lp_alive */ + + /* Update obsolete lp_alive */ + if (alive && !lp->lp_alive && lp->lp_timestamp != 0 && + cfs_time_before(lp->lp_timestamp, lp->lp_last_alive)) lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); return alive; } -/* don't query LND about aliveness of a dead peer more frequently than: */ -static int lnet_queryinterval = 1; /* 1 second */ /* NB: returns 1 when alive, 0 when dead, negative when error; * may drop the LNET_LOCK */ @@ -953,7 +958,7 @@ int lnet_peer_alive_locked (lnet_peer_t *lp) { lnet_ni_t *ni = lp->lp_ni; - time_t now = cfs_time_current_sec(); + cfs_time_t now = cfs_time_current(); LASSERT (ni != NULL); @@ -963,24 +968,27 @@ lnet_peer_alive_locked (lnet_peer_t *lp) if (lnet_peer_is_alive(lp, now)) return 1; - /* peer appears dead, should we query right now? */ + /* Peer appears dead, but we should avoid frequent NI queries (at + * most once per lnet_queryinterval seconds). 
*/ if (lp->lp_last_query != 0) { - time_t deadline = - cfs_time_add(lp->lp_last_query, - lnet_queryinterval); + static const int lnet_queryinterval = 1; - if (cfs_time_before(now, deadline)) { + cfs_time_t next_query = + cfs_time_add(lp->lp_last_query, + cfs_time_seconds(lnet_queryinterval)); + + if (cfs_time_before(now, next_query)) { if (lp->lp_alive) CWARN("Unexpected aliveness of peer %s: " "%d < %d (%d/%d)\n", libcfs_nid2str(lp->lp_nid), - (int)now, (int)deadline, + (int)now, (int)next_query, lnet_queryinterval, ni->ni_peertimeout); return 0; } } - /* query LND for latest aliveness news */ + /* query NI for latest aliveness news */ lnet_ni_peer_alive(lp); if (lnet_peer_is_alive(lp, now)) @@ -1392,6 +1400,7 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) lp2 = route->lr_gateway; if (lp2->lp_alive && + lnet_router_down_ni(lp2, rnet->lrn_net) <= 0 && (src_ni == NULL || lp2->lp_ni == src_ni) && (lp == NULL || lnet_compare_routers(lp2, lp) > 0)) { best_route = route; @@ -2097,7 +2106,6 @@ lnet_print_hdr(lnet_hdr_t * hdr) } - int lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, void *private, int rdma_req) @@ -2154,6 +2162,19 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, return -EPROTO; } + if (the_lnet.ln_routing) { + cfs_time_t now = cfs_time_current(); + + LNET_LOCK(); + + ni->ni_last_alive = now; + if (ni->ni_status != NULL && + ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) + ni->ni_status->ns_status = LNET_NI_STATUS_UP; + + LNET_UNLOCK(); + } + /* Regard a bad destination NID as a protocol error. 
Senders should * know what they're doing; if they don't they're misconfigured, buggy * or malicious so we chop them off at the knees :) */ diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index d39507b..fd7d31c 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -122,6 +122,7 @@ lnet_destroy_peer_locked (lnet_peer_t *lp) LASSERT (lp->lp_rtr_refcount == 0); LASSERT (list_empty(&lp->lp_txq)); LASSERT (lp->lp_txqnob == 0); + LASSERT (lp->lp_rcd == NULL); LIBCFS_FREE(lp, sizeof(*lp)); @@ -186,8 +187,8 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid) lp->lp_alive_count = 0; lp->lp_timestamp = 0; lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */ - lp->lp_last_alive = cfs_time_current_sec(); /* assumes alive */ - lp->lp_last_query = 0; /* didn't ask LND yet */ + lp->lp_last_alive = cfs_time_current(); /* assumes alive */ + lp->lp_last_query = 0; /* haven't asked NI yet */ lp->lp_ping_timestamp = 0; lp->lp_nid = nid; lp->lp_refcount = 2; /* 1 for caller; 1 for hash */ diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 9714c21..b89c5f3 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -77,15 +77,19 @@ static int check_routers_before_use = 0; CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444, "Assume routers are down and ping them before use"); -static int dead_router_check_interval = 0; +static int avoid_asym_router_failure = 0; +CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0444, + "Avoid asymmetrical failures: reserved, use at your own risk"); + +int dead_router_check_interval = 0; CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0444, "Seconds between dead router health checks (<= 0 to disable)"); -static int live_router_check_interval = 0; +int live_router_check_interval = 0; CFS_MODULE_PARM(live_router_check_interval, "i", int, 0444, "Seconds between live router health checks (<= 0 to disable)"); -static int router_ping_timeout = 50; +int router_ping_timeout = 50; CFS_MODULE_PARM(router_ping_timeout, "i", int, 
0444, "Seconds to wait for the reply to a router health query"); @@ -96,9 +100,9 @@ lnet_peers_start_down(void) } void -lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, time_t when) +lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when) { - if (when < lp->lp_timestamp) { /* out of date information */ + if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */ CDEBUG(D_NET, "Out of date\n"); return; } @@ -201,6 +205,12 @@ lnet_rtr_decref_locked(lnet_peer_t *lp) lp->lp_rtr_refcount--; if (lp->lp_rtr_refcount == 0) { + if (lp->lp_rcd != NULL) { + list_add(&lp->lp_rcd->rcd_list, + &the_lnet.ln_zombie_rcd); + lp->lp_rcd = NULL; + } + list_del(&lp->lp_rtr_list); /* decref for the_lnet.ln_routers */ lnet_peer_decref_locked(lp); @@ -496,6 +506,108 @@ lnet_get_route (int idx, __u32 *net, __u32 *hops, } +/* Convert a ping reply written by an opposite-endian peer to host byte + * order, in place. pi_magic is converted as well, so the buffer is left + * carrying the native magic and callers that re-examine the same buffer, + * such as lnet_router_down_ni(), do not swab it a second time. */ void +lnet_swap_pinginfo(lnet_ping_info_t *info) +{ + int i; + lnet_ni_status_t *stat; + + __swab32s(&info->pi_magic); + __swab32s(&info->pi_version); + __swab32s(&info->pi_pid); + __swab32s(&info->pi_nnis); + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + stat = &info->pi_ni[i]; + __swab64s(&stat->ns_nid); + __swab32s(&stat->ns_status); + } + return; +} + +/* Returns # of down NIs, or negative error codes; ignore downed NIs + * if a NI in 'net' is up */ +int +lnet_router_down_ni(lnet_peer_t *rtr, __u32 net) +{ + int i; + int down = 0; + int ptl_up = 0; + int ptl_down = 0; + lnet_ping_info_t *info; + + if (!avoid_asym_router_failure) + return -ENOENT; + + if (rtr->lp_rcd == NULL) + return -EINVAL; + + if (!rtr->lp_alive) + return -EINVAL; /* stale lp_rcd */ + + info = rtr->lp_rcd->rcd_pinginfo; + LASSERT (info != NULL); + + /* NB always racing with network!
*/ + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(info); + } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CDEBUG(D_NETERROR, "%s: Unexpected magic %08x\n", + libcfs_nid2str(rtr->lp_nid), info->pi_magic); + return -EPROTO; + } + + if (info->pi_version == LNET_PROTO_PING_VERSION1) + return -ENOENT; /* v1 doesn't carry NI status info */ + + if (info->pi_version != LNET_PROTO_PING_VERSION) { + CDEBUG(D_NETERROR, "%s: Unexpected version 0x%x\n", + libcfs_nid2str(rtr->lp_nid), info->pi_version); + return -EPROTO; + } + + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + lnet_ni_status_t *stat = &info->pi_ni[i]; + lnet_nid_t nid = stat->ns_nid; + + if (nid == LNET_NID_ANY) { + CDEBUG(D_NETERROR, "%s: unexpected LNET_NID_ANY\n", + libcfs_nid2str(rtr->lp_nid)); + return -EPROTO; + } + + if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + continue; + + if (stat->ns_status == LNET_NI_STATUS_DOWN) { + if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND) + ptl_down = 1; + else + down++; + continue; + } + + if (stat->ns_status != LNET_NI_STATUS_UP) { + CDEBUG(D_NETERROR, "%s: Unexpected status 0x%x\n", + libcfs_nid2str(rtr->lp_nid), stat->ns_status); + return -EPROTO; + } + + /* ignore downed NIs if there's a NI up for dest network */ + if (LNET_NIDNET(nid) == net) + return 0; + + if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND) + ptl_up = 1; + } + + /* ptl NIs are considered down only when they're all down */ + return down + (ptl_up ? 
0 : ptl_down); +} + +void lnet_wait_known_routerstate(void) { lnet_peer_t *rtr; @@ -535,11 +642,17 @@ lnet_router_checker_event (lnet_event_t *event) /* CAVEAT EMPTOR: I'm called with LNET_LOCKed and I'm not allowed to * drop it (that's how come I see _every_ event, even ones that would * overflow my EQ) */ - lnet_peer_t *lp; - lnet_nid_t nid; + lnet_rc_data_t *rcd = event->md.user_ptr; + lnet_peer_t *lp; + lnet_nid_t nid; if (event->unlinked) { - /* The router checker thread has unlinked the rc_md + if (rcd != NULL) { + LNetInvalidateHandle(&rcd->rcd_mdh); + return; + } + + /* The router checker thread has unlinked the default rc_md * and exited. */ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKING); the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKED; @@ -575,7 +688,7 @@ lnet_router_checker_event (lnet_event_t *event) * apps get burned). */ lnet_notify_locked(lp, 1, (event->status == 0), - cfs_time_current_sec()); + cfs_time_current()); /* The router checker will wake up very shortly and do the * actual notification. @@ -591,6 +704,100 @@ lnet_router_checker_event (lnet_event_t *event) lnet_peer_decref_locked(lp); } +void +lnet_update_ni_status(void) +{ + cfs_time_t now = cfs_time_current(); + lnet_ni_t *ni; + int status; + int timeout; + + LASSERT (the_lnet.ln_routing); + + timeout = router_ping_timeout + + MAX(live_router_check_interval, dead_router_check_interval); + + LNET_LOCK(); + + list_for_each_entry (ni, &the_lnet.ln_nis, ni_list) { + lnet_ni_status_t *ns = ni->ni_status; + + LASSERT (ns != NULL); + + status = LNET_NI_STATUS_UP; + if (ni->ni_lnd->lnd_type != LOLND && /* @lo forever alive */ + cfs_time_after(now, cfs_time_add(ni->ni_last_alive, + cfs_time_seconds(timeout)))) + status = LNET_NI_STATUS_DOWN; + + if (ns->ns_status != status) { + ns->ns_status = status; + CDEBUG(D_NET, "NI(%s:%d) status changed to %s\n", + libcfs_nid2str(ni->ni_nid), timeout, + status == LNET_NI_STATUS_UP ? 
"up" : "down"); + } + } + + LNET_UNLOCK(); +} + +void +lnet_destroy_rc_data (lnet_rc_data_t *rcd) +{ + LASSERT (list_empty(&rcd->rcd_list)); + /* detached from network */ + LASSERT (LNetHandleIsInvalid(rcd->rcd_mdh)); + + LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE); + LIBCFS_FREE(rcd, sizeof(*rcd)); + return; +} + +lnet_rc_data_t * +lnet_create_rc_data (void) +{ + int i; + int rc; + lnet_ping_info_t *pi; + lnet_rc_data_t *rcd; + + LIBCFS_ALLOC(rcd, sizeof(*rcd)); + if (rcd == NULL) + return NULL; + + LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE); + if (pi == NULL) { + LIBCFS_FREE(rcd, sizeof(*rcd)); + return NULL; + } + + memset(pi, 0, LNET_PINGINFO_SIZE); + for (i = 0; i < LNET_MAX_RTR_NIS; i++) { + pi->pi_ni[i].ns_nid = LNET_NID_ANY; + pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; + } + rcd->rcd_pinginfo = pi; + LNetInvalidateHandle(&rcd->rcd_mdh); + CFS_INIT_LIST_HEAD(&rcd->rcd_list); + + LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh)); + rc = LNetMDBind((lnet_md_t){.start = pi, + .user_ptr = rcd, + .length = LNET_PINGINFO_SIZE, + .threshold = LNET_MD_THRESH_INF, + .options = LNET_MD_TRUNCATE, + .eq_handle = the_lnet.ln_rc_eqh}, + LNET_UNLINK, + &rcd->rcd_mdh); + if (rc < 0) { + CERROR("Can't bind MD: %d\n", rc); + lnet_destroy_rc_data(rcd); + return NULL; + } + LASSERT (rc == 0); + return rcd; +} + static int lnet_router_check_interval (lnet_peer_t *rtr) { @@ -607,23 +814,42 @@ lnet_router_check_interval (lnet_peer_t *rtr) static void lnet_ping_router_locked (lnet_peer_t *rtr) { - lnet_process_id_t id; - int secs; - time_t now = cfs_time_current_sec(); + int newrcd = 0; + lnet_rc_data_t *rcd = NULL; + cfs_time_t now = cfs_time_current(); + int secs; lnet_peer_addref_locked(rtr); if (rtr->lp_ping_deadline != 0 && /* ping timed out? 
*/ - now > rtr->lp_ping_deadline) + cfs_time_after(now, rtr->lp_ping_deadline)) lnet_notify_locked(rtr, 1, 0, now); + if (avoid_asym_router_failure && rtr->lp_rcd == NULL) + newrcd = 1; + LNET_UNLOCK(); /* Run any outstanding notifications */ lnet_do_notify(rtr); + if (newrcd) + rcd = lnet_create_rc_data(); + LNET_LOCK(); + if (!lnet_isrouter(rtr)) { + lnet_peer_decref_locked(rtr); + if (rcd != NULL) + list_add(&rcd->rcd_list, &the_lnet.ln_zombie_rcd); + return; /* router table changed! */ + } + + if (rcd != NULL) { + LASSERT (rtr->lp_rcd == NULL); + rtr->lp_rcd = rcd; + } + secs = lnet_router_check_interval(rtr); CDEBUG(D_NET, @@ -634,23 +860,32 @@ lnet_ping_router_locked (lnet_peer_t *rtr) rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp); if (secs != 0 && !rtr->lp_ping_notsent && - now > rtr->lp_ping_timestamp + secs) { + cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp, + cfs_time_seconds(secs)))) { + int rc; + lnet_process_id_t id; + lnet_handle_md_t mdh; + id.nid = rtr->lp_nid; id.pid = LUSTRE_SRV_LNET_PID; CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id)); rtr->lp_ping_notsent = 1; rtr->lp_ping_timestamp = now; + mdh = (rtr->lp_rcd == NULL) ? 
the_lnet.ln_rc_mdh : + rtr->lp_rcd->rcd_mdh; if (rtr->lp_ping_deadline == 0) - rtr->lp_ping_deadline = now + router_ping_timeout; + rtr->lp_ping_deadline = cfs_time_shift(router_ping_timeout); LNET_UNLOCK(); - LNetGet(LNET_NID_ANY, the_lnet.ln_rc_mdh, id, - LNET_RESERVED_PORTAL, LNET_PROTO_PING_MATCHBITS, 0); + rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); LNET_LOCK(); + if (rc != 0) + rtr->lp_ping_notsent = 0; /* no event pending */ } lnet_peer_decref_locked(rtr); @@ -732,7 +967,8 @@ lnet_router_checker_start(void) return -EINVAL; } - if (live_router_check_interval <= 0 && + if (!the_lnet.ln_routing && + live_router_check_interval <= 0 && dead_router_check_interval <= 0) return 0; @@ -753,6 +989,7 @@ lnet_router_checker_start(void) } memset(&md, 0, sizeof(md)); + md.user_ptr = NULL; md.start = &pinginfo; md.length = sizeof(pinginfo); md.options = LNET_MD_TRUNCATE; @@ -824,6 +1061,68 @@ lnet_router_checker_stop (void) #if defined(__KERNEL__) && defined(LNET_ROUTER) +static void +lnet_prune_zombie_rcd (int wait_unlink) +{ + lnet_rc_data_t *rcd; + lnet_rc_data_t *tmp; + struct list_head free_rcd; + int i; + __u64 version; + + CFS_INIT_LIST_HEAD(&free_rcd); + + LNET_LOCK(); +rescan: + version = the_lnet.ln_routers_version; + list_for_each_entry_safe (rcd, tmp, &the_lnet.ln_zombie_rcd, rcd_list) { + if (LNetHandleIsInvalid(rcd->rcd_mdh)) { + list_del(&rcd->rcd_list); + list_add(&rcd->rcd_list, &free_rcd); + continue; + } + + LNET_UNLOCK(); + + LNetMDUnlink(rcd->rcd_mdh); + + LNET_LOCK(); + if (version != the_lnet.ln_routers_version) + goto rescan; + } + + i = 2; + while (wait_unlink && !list_empty(&the_lnet.ln_zombie_rcd)) { + rcd = list_entry(the_lnet.ln_zombie_rcd.next, + lnet_rc_data_t, rcd_list); + if (LNetHandleIsInvalid(rcd->rcd_mdh)) { + list_del(&rcd->rcd_list); + list_add(&rcd->rcd_list, &free_rcd); + continue; + } + + LNET_UNLOCK(); + + LNetMDUnlink(rcd->rcd_mdh); + + i++; + CDEBUG(((i & (-i)) == i) ? 
D_WARNING : D_NET, + "Waiting for rc buffers to unlink\n"); + cfs_pause(cfs_time_seconds(1)); + + LNET_LOCK(); + } + + LNET_UNLOCK(); + + while (!list_empty(&free_rcd)) { + rcd = list_entry(free_rcd.next, lnet_rc_data_t, rcd_list); + list_del_init(&rcd->rcd_list); + lnet_destroy_rc_data(rcd); + } + return; +} + static int lnet_router_checker(void *arg) { @@ -859,6 +1158,11 @@ rescan: LNET_UNLOCK(); + if (the_lnet.ln_routing) + lnet_update_ni_status(); + + lnet_prune_zombie_rcd(0); /* don't wait for UNLINK */ + /* Call cfs_pause() here always adds 1 to load average * because kernel counts # active tasks as nr_running * + nr_uninterruptible. */ @@ -866,6 +1170,23 @@ rescan: cfs_time_seconds(1)); } + LNET_LOCK(); + + list_for_each (entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + + if (rtr->lp_rcd == NULL) + continue; + + LASSERT (list_empty(&rtr->lp_rcd->rcd_list)); + list_add(&rtr->lp_rcd->rcd_list, &the_lnet.ln_zombie_rcd); + rtr->lp_rcd = NULL; + } + + LNET_UNLOCK(); + + lnet_prune_zombie_rcd(1); /* wait for UNLINK */ + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_STOPTHREAD); the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKING; @@ -1079,10 +1400,10 @@ lnet_alloc_rtrpools(int im_a_router) } int -lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) +lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when) { - lnet_peer_t *lp = NULL; - time_t now = cfs_time_current_sec(); + lnet_peer_t *lp = NULL; + cfs_time_t now = cfs_time_current(); LASSERT (!in_interrupt ()); @@ -1100,12 +1421,12 @@ lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) } /* can't do predictions... */ - if (when > now) { + if (cfs_time_after(when, now)) { CWARN ("Ignoring prediction from %s of %s %s " "%ld seconds in the future\n", (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), libcfs_nid2str(nid), alive ? 
"up" : "down", - when - now); + cfs_duration_sec(cfs_time_sub(when, now))); return -EINVAL; } @@ -1156,7 +1477,7 @@ lnet_get_tunables (void) #else int -lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) +lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when) { return -EOPNOTSUPP; } diff --git a/lnet/lnet/router_proc.c b/lnet/lnet/router_proc.c index 82a82af5..f04ccc9 100644 --- a/lnet/lnet/router_proc.c +++ b/lnet/lnet/router_proc.c @@ -235,9 +235,9 @@ int LL_PROC_PROTO(proc_lnet_routers) if (*ppos == 0) { s += snprintf(s, tmpstr + tmpsiz - s, - "%-4s %7s %9s %6s %12s %s\n", - "ref", "rtr_ref", "alive_cnt", "state", - "last_ping", "router"); + "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n", + "ref", "rtr_ref", "alive_cnt", "state", "last_ping", + "ping_sent", "deadline", "down_ni", "router"); LASSERT (tmpstr + tmpsiz - s > 0); LNET_LOCK(); @@ -272,18 +272,32 @@ int LL_PROC_PROTO(proc_lnet_routers) } if (peer != NULL) { - int nrefs = peer->lp_refcount; - int nrtrrefs = peer->lp_rtr_refcount; - int alive_cnt = peer->lp_alive_count; - int alive = peer->lp_alive; - time_t last_ping = peer->lp_ping_timestamp; - lnet_nid_t nid = peer->lp_nid; - - s += snprintf(s, tmpstr + tmpsiz - s, - "%-4d %7d %9d %6s %12lu %s\n", - nrefs, nrtrrefs, - alive_cnt, alive ? "up" : "down", - last_ping, libcfs_nid2str(nid)); + lnet_nid_t nid = peer->lp_nid; + cfs_time_t now = cfs_time_current(); + cfs_time_t deadline = peer->lp_ping_deadline; + int nrefs = peer->lp_refcount; + int nrtrrefs = peer->lp_rtr_refcount; + int alive_cnt = peer->lp_alive_count; + int alive = peer->lp_alive; + int pingsent = !peer->lp_ping_notsent; + int last_ping = cfs_duration_sec(now - peer->lp_ping_timestamp); + int down_ni = lnet_router_down_ni(peer, LNET_NIDNET(LNET_NID_ANY)); + + if (deadline == 0) + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? 
"up" : "down", last_ping, + pingsent, "NA", down_ni, + libcfs_nid2str(nid)); + else + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? "up" : "down", last_ping, + pingsent, + cfs_duration_sec(deadline - now), + down_ni, libcfs_nid2str(nid)); LASSERT (tmpstr + tmpsiz - s > 0); } @@ -539,9 +553,9 @@ int LL_PROC_PROTO(proc_lnet_nis) if (*ppos == 0) { s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %4s %4s %4s %5s %5s %5s\n", - "nid", "refs", "peer", "rtr", "max", - "tx", "min"); + "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n", + "nid", "status", "alive", "refs", "peer", + "rtr", "max", "tx", "min"); LASSERT (tmpstr + tmpsiz - s > 0); } else { struct list_head *n; @@ -565,6 +579,7 @@ int LL_PROC_PROTO(proc_lnet_nis) } if (ni != NULL) { + cfs_time_t now = cfs_time_current(); int maxtxcr = ni->ni_maxtxcredits; int txcr = ni->ni_txcredits; int mintxcr = ni->ni_mintxcredits; @@ -572,10 +587,21 @@ int LL_PROC_PROTO(proc_lnet_nis) int npeerrtrcr = ni->ni_peerrtrcredits; lnet_nid_t nid = ni->ni_nid; int nref = ni->ni_refcount; + int last_alive; + char *stat; + + last_alive = (the_lnet.ln_routing) ? + cfs_duration_sec(now - ni->ni_last_alive) : -1; + if (ni->ni_lnd->lnd_type == LOLND) /* @lo forever alive */ + last_alive = 0; + + LASSERT (ni->ni_status != NULL); + stat = (ni->ni_status->ns_status == LNET_NI_STATUS_UP) ? + "up" : "down"; s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %4d %4d %4d %5d %5d %5d\n", - libcfs_nid2str(nid), nref, + "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n", + libcfs_nid2str(nid), stat, last_alive, nref, npeertxcr, npeerrtrcr, maxtxcr, txcr, mintxcr); LASSERT (tmpstr + tmpsiz - s > 0);