From 2c56a64e065f471057928f16b9b550ca0050bdc1 Mon Sep 17 00:00:00 2001 From: isaac Date: Wed, 8 Jul 2009 19:33:31 +0000 Subject: [PATCH] i=liang,b=13065: - port router pinger to userspace. --- lnet/ChangeLog | 4 + lnet/include/lnet/lib-lnet.h | 3 + lnet/include/lnet/lib-types.h | 8 +- lnet/lnet/api-ni.c | 2 + lnet/lnet/lib-eq.c | 10 + lnet/lnet/lib-move.c | 13 + lnet/lnet/router.c | 707 ++++++++++++++++++++++++++---------------- lnet/ulnds/ptllnd/ptllnd.c | 1 - lnet/ulnds/ptllnd/ptllnd.h | 1 - lnet/ulnds/ptllnd/ptllnd_cb.c | 84 +++-- 10 files changed, 512 insertions(+), 321 deletions(-) diff --git a/lnet/ChangeLog b/lnet/ChangeLog index 0d739c9..c8f741e 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -17,6 +17,10 @@ Bugzilla : Description: Details : +Severity : enhancement +Bugzilla : 13065 +Description: port router pinger to userspace + Severity : normal Bugzilla : 17546 Description: kptllnd HELLO protocol deadlock diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index cf04b09..87a29c7 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -658,6 +658,8 @@ void lnet_connect_console_error(int rc, lnet_nid_t peer_nid, int lnet_count_acceptor_nis(void); int lnet_acceptor_timeout(void); int lnet_acceptor_port(void); +#else +void lnet_router_checker(void); #endif #ifdef HAVE_LIBPTHREAD @@ -668,6 +670,7 @@ int lnet_acceptor_port(void); int lnet_acceptor_start(void); void lnet_acceptor_stop(void); +void lnet_get_tunables(void); int lnet_peers_start_down(void); int lnet_peer_buffer_credits(lnet_ni_t *ni); int lnet_router_checker_start(void); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index bba2881..8e5f1a0 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -486,8 +486,7 @@ typedef struct { unsigned int ptl_options; } lnet_portal_t; -/* Router Checker */ -/* < 0 == startup error */ +/* Router Checker states */ #define LNET_RC_STATE_SHUTDOWN 0 /* not started */ #define LNET_RC_STATE_RUNNING 1 /* started up OK */ #define LNET_RC_STATE_STOPTHREAD 2 /* telling thread to stop */ @@ -572,10 +571,11 @@ typedef struct lnet_ping_info_t *ln_ping_info; #ifdef __KERNEL__ - int ln_rc_state; /* router checker startup/shutdown state */ struct semaphore ln_rc_signal; /* serialise startup/shutdown */ - lnet_handle_eq_t ln_rc_eqh; /* router checker's event queue */ #endif + int ln_rc_state; /* router checker startup/shutdown state */ + lnet_handle_eq_t ln_rc_eqh; /* router checker's event queue */ + lnet_handle_md_t ln_rc_mdh; #ifdef LNET_USE_LIB_FREELIST lnet_freelist_t ln_free_mes; diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index fac8296..a7a4f2b 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1172,6 +1172,8 @@ LNetNIInit(lnet_pid_t requested_pid) goto out; } + lnet_get_tunables(); + if (requested_pid == LNET_PID_ANY) { /* Don't instantiate LNET just for me */ rc = -ENETDOWN; diff --git a/lnet/lnet/lib-eq.c b/lnet/lnet/lib-eq.c index f24758d..701352c 100644 --- a/lnet/lnet/lib-eq.c +++ b/lnet/lnet/lib-eq.c @@ -219,6 +219,16 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms, LNET_LOCK(); for (;;) { +#ifndef __KERNEL__ + LNET_UNLOCK(); + + /* Recursion breaker */ + if (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING && + !LNetHandleIsEqual(eventqs[0], the_lnet.ln_rc_eqh)) + lnet_router_checker(); + + LNET_LOCK(); +#endif for (i = 0; i < neq; i++) { lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]); diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index a0bebbf..f31b0fe 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1361,6 +1361,19 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) } LASSERT (lp->lp_ni == src_ni); } else { +#ifndef __KERNEL__ + LNET_UNLOCK(); + + /* NB + * - once application finishes computation, check here to update + * router states before it waits for pending IO in LNetEQPoll + * - recursion breaker: router checker sends no message + * to remote networks */ + if (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) + lnet_router_checker(); + + LNET_LOCK(); +#endif /* sending to a remote network */ rnet = lnet_find_net_locked(LNET_NIDNET(dst_nid)); if (rnet == NULL) { diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 42df3a2..9714c21 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -47,6 +47,32 @@ static int auto_down = 1; CFS_MODULE_PARM(auto_down, "i", int, 0444, "Automatically mark peers down on comms error"); +int +lnet_peer_buffer_credits(lnet_ni_t *ni) +{ + /* NI option overrides LNet default */ + if (ni->ni_peerrtrcredits > 0) + return ni->ni_peerrtrcredits; + if (peer_buffer_credits > 0) + return peer_buffer_credits; + + /* As an approximation, allow this peer the same number of router + * buffers as it is allowed outstanding sends */ + return ni->ni_peertxcredits; +} + +/* forward ref's */ +static int lnet_router_checker(void *); +#else + +int +lnet_peer_buffer_credits(lnet_ni_t *ni) +{ + return 0; +} + +#endif + static int check_routers_before_use = 0; CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444, "Assume routers are down and ping them before use"); @@ -69,20 +95,6 @@ lnet_peers_start_down(void) return check_routers_before_use; } -int -lnet_peer_buffer_credits(lnet_ni_t *ni) -{ - /* NI option overrides LNet default */ - if (ni->ni_peerrtrcredits > 0) - return ni->ni_peerrtrcredits; - if (peer_buffer_credits > 0) - return peer_buffer_credits; - - /* As an approximation, allow this peer the same number of router - * buffers as it is allowed outstanding sends */ - return ni->ni_peertxcredits; -} - void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, time_t when) { @@ -154,90 +166,6 @@ lnet_do_notify (lnet_peer_t *lp) LNET_UNLOCK(); } -int -lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) -{ - lnet_peer_t *lp = NULL; - time_t now = cfs_time_current_sec(); - - LASSERT (!in_interrupt ()); - - CDEBUG (D_NET, "%s notifying %s: %s\n", - (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(nid), - alive ? "up" : "down"); - - if (ni != NULL && - LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) { - CWARN ("Ignoring notification of %s %s by %s (different net)\n", - libcfs_nid2str(nid), alive ? "birth" : "death", - libcfs_nid2str(ni->ni_nid)); - return -EINVAL; - } - - /* can't do predictions... */ - if (when > now) { - CWARN ("Ignoring prediction from %s of %s %s " - "%ld seconds in the future\n", - (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(nid), alive ? "up" : "down", - when - now); - return -EINVAL; - } - - if (ni != NULL && !alive && /* LND telling me she's down */ - !auto_down) { /* auto-down disabled */ - CDEBUG(D_NET, "Auto-down disabled\n"); - return 0; - } - - LNET_LOCK(); - - lp = lnet_find_peer_locked(nid); - if (lp == NULL) { - /* nid not found */ - LNET_UNLOCK(); - CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); - return 0; - } - - /* We can't fully trust LND on reporting exact peer last_alive - * if he notifies us about dead peer. For example ksocklnd can - * call us with when == _time_when_the_node_was_booted_ if - * no connections were successfully established */ - if (ni != NULL && !alive && when < lp->lp_last_alive) - when = lp->lp_last_alive; - - lnet_notify_locked(lp, ni == NULL, alive, when); - - LNET_UNLOCK(); - - lnet_do_notify(lp); - - LNET_LOCK(); - - lnet_peer_decref_locked(lp); - - LNET_UNLOCK(); - return 0; -} -EXPORT_SYMBOL(lnet_notify); - -#else - -int -lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) -{ - return -EOPNOTSUPP; -} - -void -lnet_notify_locked (lnet_peer_t *lp, int notifylnd, int alive, time_t when) -{ - return; -} - -#endif static void lnet_rtr_addref_locked(lnet_peer_t *lp) @@ -567,7 +495,40 @@ lnet_get_route (int idx, __u32 *net, __u32 *hops, return -ENOENT; } -#if defined(__KERNEL__) && defined(LNET_ROUTER) +void +lnet_wait_known_routerstate(void) +{ + lnet_peer_t *rtr; + struct list_head *entry; + int all_known; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + for (;;) { + LNET_LOCK(); + + all_known = 1; + list_for_each (entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + + if (rtr->lp_alive_count == 0) { + all_known = 0; + break; + } + } + + LNET_UNLOCK(); + + if (all_known) + return; + +#ifndef __KERNEL__ + lnet_router_checker(); +#endif + cfs_pause(cfs_time_seconds(1)); + } +} + static void lnet_router_checker_event (lnet_event_t *event) { @@ -582,7 +543,9 @@ lnet_router_checker_event (lnet_event_t *event) * and exited. */ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKING); the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKED; +#ifdef __KERNEL__ mutex_up(&the_lnet.ln_rc_signal); +#endif return; } @@ -629,187 +592,135 @@ lnet_router_checker_event (lnet_event_t *event) } static int -lnet_router_checker(void *arg) +lnet_router_check_interval (lnet_peer_t *rtr) { - static lnet_ping_info_t pinginfo; - - int rc; - lnet_handle_md_t mdh; - lnet_peer_t *rtr; - lnet_md_t md = {0}; - struct list_head *entry; - time_t now; - lnet_process_id_t rtr_id; - int secs; - - cfs_daemonize("router_checker"); - cfs_block_allsigs(); - - rtr_id.pid = LUSTRE_SRV_LNET_PID; - - LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); - - /* initialize md content */ - md.start = &pinginfo; - md.length = sizeof(pinginfo); - md.threshold = LNET_MD_THRESH_INF; - md.max_size = 0; - md.options = LNET_MD_TRUNCATE, - md.user_ptr = NULL; - md.eq_handle = the_lnet.ln_rc_eqh; - - rc = LNetMDBind(md, LNET_UNLINK, &mdh); - - if (rc < 0) { - CERROR("Can't bind MD: %d\n", rc); - the_lnet.ln_rc_state = rc; - mutex_up(&the_lnet.ln_rc_signal); - return rc; - } - - LASSERT (rc == 0); + int secs; - the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; - mutex_up(&the_lnet.ln_rc_signal); /* let my parent go */ - - while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { - __u64 version; - - LNET_LOCK(); -rescan: - version = the_lnet.ln_routers_version; + secs = rtr->lp_alive ? live_router_check_interval : + dead_router_check_interval; + if (secs < 0) + secs = 0; - list_for_each (entry, &the_lnet.ln_routers) { - rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); - - lnet_peer_addref_locked(rtr); - - now = cfs_time_current_sec(); + return secs; +} - if (rtr->lp_ping_deadline != 0 && /* ping timed out? */ - now > rtr->lp_ping_deadline) - lnet_notify_locked(rtr, 1, 0, now); +static void +lnet_ping_router_locked (lnet_peer_t *rtr) +{ + lnet_process_id_t id; + int secs; + time_t now = cfs_time_current_sec(); - LNET_UNLOCK(); + lnet_peer_addref_locked(rtr); - /* Run any outstanding notificiations */ - lnet_do_notify(rtr); + if (rtr->lp_ping_deadline != 0 && /* ping timed out? */ + now > rtr->lp_ping_deadline) + lnet_notify_locked(rtr, 1, 0, now); - if (rtr->lp_alive) { - secs = live_router_check_interval; - } else { - secs = dead_router_check_interval; - } - if (secs <= 0) - secs = 0; + LNET_UNLOCK(); - if (secs != 0 && - !rtr->lp_ping_notsent && - now > rtr->lp_ping_timestamp + secs) { - CDEBUG(D_NET, "Check: %s\n", - libcfs_nid2str(rtr->lp_nid)); + /* Run any outstanding notifications */ + lnet_do_notify(rtr); - LNET_LOCK(); - rtr_id.nid = rtr->lp_nid; - rtr->lp_ping_notsent = 1; - rtr->lp_ping_timestamp = now; + LNET_LOCK(); - if (rtr->lp_ping_deadline == 0) - rtr->lp_ping_deadline = - now + router_ping_timeout; + secs = lnet_router_check_interval(rtr); - LNET_UNLOCK(); + CDEBUG(D_NET, + "rtr %s %d: deadline %lu ping_notsent %d alive %d " + "alive_count %d lp_ping_timestamp %lu\n", + libcfs_nid2str(rtr->lp_nid), secs, + rtr->lp_ping_deadline, rtr->lp_ping_notsent, + rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp); - LNetGet(LNET_NID_ANY, mdh, rtr_id, - LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0); - } + if (secs != 0 && !rtr->lp_ping_notsent && + now > rtr->lp_ping_timestamp + secs) { + id.nid = rtr->lp_nid; + id.pid = LUSTRE_SRV_LNET_PID; + CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id)); - LNET_LOCK(); - lnet_peer_decref_locked(rtr); + rtr->lp_ping_notsent = 1; + rtr->lp_ping_timestamp = now; - if (version != the_lnet.ln_routers_version) { - /* the routers list has changed */ - goto rescan; - } - } + if (rtr->lp_ping_deadline == 0) + rtr->lp_ping_deadline = now + router_ping_timeout; LNET_UNLOCK(); - /* Call cfs_pause() here always adds 1 to load average - * because kernel counts # active tasks as nr_running - * + nr_uninterruptible. */ - cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE, - cfs_time_seconds(1)); - } - - LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_STOPTHREAD); - the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKING; + LNetGet(LNET_NID_ANY, the_lnet.ln_rc_mdh, id, + LNET_RESERVED_PORTAL, LNET_PROTO_PING_MATCHBITS, 0); - rc = LNetMDUnlink(mdh); - LASSERT (rc == 0); + LNET_LOCK(); + } - /* The unlink event callback will signal final completion */ - return 0; + lnet_peer_decref_locked(rtr); + return; } - -void -lnet_wait_known_routerstate(void) +int +lnet_router_checker_start(void) { - lnet_peer_t *rtr; - struct list_head *entry; - int all_known; + static lnet_ping_info_t pinginfo; - for (;;) { - LNET_LOCK(); + lnet_md_t md; + int rc; + int eqsz; +#ifndef __KERNEL__ + lnet_peer_t *rtr; + __u64 version; + int nrtr = 0; + int router_checker_max_eqsize = 10240; - all_known = 1; - list_for_each (entry, &the_lnet.ln_routers) { - rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); - - if (rtr->lp_alive_count == 0) { - all_known = 0; - break; - } - } + LASSERT (check_routers_before_use); + LASSERT (dead_router_check_interval > 0); - LNET_UNLOCK(); + LNET_LOCK(); - if (all_known) - return; + /* As an approximation, allow each router the same number of + * outstanding events as it is allowed outstanding sends */ + eqsz = 0; + version = the_lnet.ln_routers_version; + list_for_each_entry(rtr, &the_lnet.ln_routers, lp_rtr_list) { + lnet_ni_t *ni = rtr->lp_ni; + lnet_process_id_t id; - cfs_pause(cfs_time_seconds(1)); - } -} + nrtr++; + eqsz += ni->ni_peertxcredits; -void -lnet_router_checker_stop(void) -{ - int rc; + /* one async ping reply per router */ + id.nid = rtr->lp_nid; + id.pid = LUSTRE_SRV_LNET_PID; - LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING || - the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + LNET_UNLOCK(); - if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) - return; + rc = LNetSetAsync(id, 1); + if (rc != 0) { + CWARN("LNetSetAsync %s failed: %d\n", + libcfs_id2str(id), rc); + return rc; + } - the_lnet.ln_rc_state = LNET_RC_STATE_STOPTHREAD; - /* block until event callback signals exit */ - mutex_down(&the_lnet.ln_rc_signal); + LNET_LOCK(); + /* NB router list doesn't change in userspace */ + LASSERT (version == the_lnet.ln_routers_version); + } - LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKED); + LNET_UNLOCK(); - rc = LNetEQFree(the_lnet.ln_rc_eqh); - LASSERT (rc == 0); + if (nrtr == 0) { + CDEBUG(D_NET, + "No router found, not starting router checker\n"); + return 0; + } - the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; -} + /* at least allow a SENT and a REPLY per router */ + if (router_checker_max_eqsize < 2 * nrtr) + router_checker_max_eqsize = 2 * nrtr; -int -lnet_router_checker_start(void) -{ - int rc; + LASSERT (eqsz > 0); + if (eqsz > router_checker_max_eqsize) + eqsz = router_checker_max_eqsize; +#endif LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); @@ -825,32 +736,53 @@ lnet_router_checker_start(void) dead_router_check_interval <= 0) return 0; +#ifdef __KERNEL__ init_mutex_locked(&the_lnet.ln_rc_signal); - /* EQ size doesn't matter; the callback is guaranteed to get every * event */ - rc = LNetEQAlloc(1, lnet_router_checker_event, + eqsz = 1; + rc = LNetEQAlloc(eqsz, lnet_router_checker_event, + &the_lnet.ln_rc_eqh); +#else + rc = LNetEQAlloc(eqsz, LNET_EQ_HANDLER_NONE, &the_lnet.ln_rc_eqh); +#endif if (rc != 0) { - CERROR("Can't allocate EQ: %d\n", rc); + CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc); return -ENOMEM; } - rc = (int)cfs_kernel_thread(lnet_router_checker, NULL, 0); + memset(&md, 0, sizeof(md)); + md.start = &pinginfo; + md.length = sizeof(pinginfo); + md.options = LNET_MD_TRUNCATE; + md.threshold = LNET_MD_THRESH_INF; + md.eq_handle = the_lnet.ln_rc_eqh; + rc = LNetMDBind(md, LNET_UNLINK, &the_lnet.ln_rc_mdh); if (rc < 0) { - CERROR("Can't start router checker thread: %d\n", rc); - goto failed; + CERROR("Can't bind MD: %d\n", rc); + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT (rc == 0); + return -ENOMEM; } + LASSERT (rc == 0); - mutex_down(&the_lnet.ln_rc_signal); /* wait for checker to startup */ - - rc = the_lnet.ln_rc_state; + the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; +#ifdef __KERNEL__ + rc = (int)cfs_kernel_thread(lnet_router_checker, NULL, 0); if (rc < 0) { + CERROR("Can't start router checker thread: %d\n", rc); + the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKING; + rc = LNetMDUnlink(the_lnet.ln_rc_mdh); + LASSERT (rc == 0); + /* block until event callback signals exit */ + mutex_down(&the_lnet.ln_rc_signal); + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT (rc == 0); the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; - goto failed; + return -ENOMEM; } - - LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); +#endif if (check_routers_before_use) { /* Note that a helpful side-effect of pinging all known routers @@ -860,11 +792,88 @@ lnet_router_checker_start(void) } return 0; +} + +void +lnet_router_checker_stop (void) +{ + int rc; + + if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) + return; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + the_lnet.ln_rc_state = LNET_RC_STATE_STOPTHREAD; + +#ifdef __KERNEL__ + /* block until event callback signals exit */ + mutex_down(&the_lnet.ln_rc_signal); +#else + while (the_lnet.ln_rc_state != LNET_RC_STATE_UNLINKED) { + lnet_router_checker(); + cfs_pause(cfs_time_seconds(1)); + } +#endif + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKED); - failed: rc = LNetEQFree(the_lnet.ln_rc_eqh); LASSERT (rc == 0); - return rc; + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + return; +} + +#if defined(__KERNEL__) && defined(LNET_ROUTER) + +static int +lnet_router_checker(void *arg) +{ + int rc; + lnet_peer_t *rtr; + struct list_head *entry; + lnet_process_id_t rtr_id; + + cfs_daemonize("router_checker"); + cfs_block_allsigs(); + + rtr_id.pid = LUSTRE_SRV_LNET_PID; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { + __u64 version; + + LNET_LOCK(); +rescan: + version = the_lnet.ln_routers_version; + + list_for_each (entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + lnet_ping_router_locked(rtr); + + /* NB dropped lock */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ + goto rescan; + } + } + + LNET_UNLOCK(); + + /* Call cfs_pause() here always adds 1 to load average + * because kernel counts # active tasks as nr_running + * + nr_uninterruptible. */ + cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE, + cfs_time_seconds(1)); + } + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_STOPTHREAD); + the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKING; + + rc = LNetMDUnlink(the_lnet.ln_rc_mdh); + LASSERT (rc == 0); + + /* The unlink event callback will signal final completion */ + return 0; } void @@ -1069,30 +1078,192 @@ lnet_alloc_rtrpools(int im_a_router) return rc; } -#else - int -lnet_peers_start_down(void) +lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) { + lnet_peer_t *lp = NULL; + time_t now = cfs_time_current_sec(); + + LASSERT (!in_interrupt ()); + + CDEBUG (D_NET, "%s notifying %s: %s\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), + alive ? "up" : "down"); + + if (ni != NULL && + LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) { + CWARN ("Ignoring notification of %s %s by %s (different net)\n", + libcfs_nid2str(nid), alive ? "birth" : "death", + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + } + + /* can't do predictions... */ + if (when > now) { + CWARN ("Ignoring prediction from %s of %s %s " + "%ld seconds in the future\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), alive ? "up" : "down", + when - now); + return -EINVAL; + } + + if (ni != NULL && !alive && /* LND telling me she's down */ + !auto_down) { /* auto-down disabled */ + CDEBUG(D_NET, "Auto-down disabled\n"); + return 0; + } + + LNET_LOCK(); + + lp = lnet_find_peer_locked(nid); + if (lp == NULL) { + /* nid not found */ + LNET_UNLOCK(); + CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); + return 0; + } + + /* We can't fully trust LND on reporting exact peer last_alive + * if he notifies us about dead peer. For example ksocklnd can + * call us with when == _time_when_the_node_was_booted_ if + * no connections were successfully established */ + if (ni != NULL && !alive && when < lp->lp_last_alive) + when = lp->lp_last_alive; + + lnet_notify_locked(lp, ni == NULL, alive, when); + + LNET_UNLOCK(); + + lnet_do_notify(lp); + + LNET_LOCK(); + + lnet_peer_decref_locked(lp); + + LNET_UNLOCK(); return 0; } +EXPORT_SYMBOL(lnet_notify); + +void +lnet_get_tunables (void) +{ + return; +} + +#else int -lnet_peer_buffer_credits(lnet_ni_t *ni) +lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) { - return 0; + return -EOPNOTSUPP; } void -lnet_router_checker_stop(void) +lnet_router_checker (void) { + static time_t last = 0; + static int running = 0; + + time_t now = cfs_time_current_sec(); + int interval = now - last; + int rc; + __u64 version; + lnet_peer_t *rtr; + + /* It's no use to call me again within a sec - all intervals and + * timeouts are measured in seconds */ + if (last != 0 && interval < 2) + return; + + if (last != 0 && + interval > MAX(live_router_check_interval, + dead_router_check_interval)) + CDEBUG(D_NETERROR, "Checker(%d/%d) not called for %d seconds\n", + live_router_check_interval, dead_router_check_interval, + interval); + + LNET_LOCK(); + LASSERT (!running); /* recursion check */ + running = 1; + LNET_UNLOCK(); + + last = now; + + if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPTHREAD) { + the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKING; + rc = LNetMDUnlink(the_lnet.ln_rc_mdh); + LASSERT (rc == 0); + } + + /* consume all pending events */ + while (1) { + int i; + lnet_event_t ev; + + /* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the + * recursion breaker in LNetEQPoll would fail */ + rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i); + if (rc == 0) /* no event pending */ + break; + + /* NB a lost SENT prevents me from pinging a router again */ + if (rc == -EOVERFLOW) { + CERROR("Dropped an event!!!\n"); + abort(); + } + + LASSERT (rc == 1); + + LNET_LOCK(); + lnet_router_checker_event(&ev); + LNET_UNLOCK(); + } + + if (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKED || + the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKING) { + running = 0; + return; + } + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + LNET_LOCK(); + + version = the_lnet.ln_routers_version; + list_for_each_entry (rtr, &the_lnet.ln_routers, lp_rtr_list) { + lnet_ping_router_locked(rtr); + LASSERT (version == the_lnet.ln_routers_version); + } + + LNET_UNLOCK(); + + running = 0; /* lock only needed for the recursion check */ return; } -int -lnet_router_checker_start(void) +/* NB lnet_peers_start_down depends on me, + * so must be called before any peer creation */ +void +lnet_get_tunables (void) { - return 0; + char *s; + + s = getenv("LNET_ROUTER_PING_TIMEOUT"); + if (s != NULL) router_ping_timeout = atoi(s); + + s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL"); + if (s != NULL) live_router_check_interval = atoi(s); + + s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL"); + if (s != NULL) dead_router_check_interval = atoi(s); + + /* This replaces old lnd_notify mechanism */ + check_routers_before_use = 1; + if (dead_router_check_interval <= 0) + dead_router_check_interval = 30; } void diff --git a/lnet/ulnds/ptllnd/ptllnd.c b/lnet/ulnds/ptllnd/ptllnd.c index c0fc41a..0b70023 100644 --- a/lnet/ulnds/ptllnd/ptllnd.c +++ b/lnet/ulnds/ptllnd/ptllnd.c @@ -48,7 +48,6 @@ lnd_t the_ptllnd = { .lnd_send = ptllnd_send, .lnd_recv = ptllnd_recv, .lnd_eager_recv = ptllnd_eager_recv, - .lnd_notify = ptllnd_notify, .lnd_wait = ptllnd_wait, .lnd_setasync = ptllnd_setasync, }; diff --git a/lnet/ulnds/ptllnd/ptllnd.h b/lnet/ulnds/ptllnd/ptllnd.h index b8198b2..2ad730e 100644 --- a/lnet/ulnds/ptllnd/ptllnd.h +++ b/lnet/ulnds/ptllnd/ptllnd.h @@ -238,7 +238,6 @@ int ptllnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, void **new_privatep); ptllnd_tx_t *ptllnd_new_tx(ptllnd_peer_t *peer, int type, int payload_nob); -void ptllnd_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive); int ptllnd_setasync(lnet_ni_t *ni, lnet_process_id_t id, int n); void ptllnd_wait(lnet_ni_t *ni, int milliseconds); void ptllnd_check_sends(ptllnd_peer_t *peer); diff --git a/lnet/ulnds/ptllnd/ptllnd_cb.c b/lnet/ulnds/ptllnd/ptllnd_cb.c index 9cfa071..6a41072 100644 --- a/lnet/ulnds/ptllnd/ptllnd_cb.c +++ b/lnet/ulnds/ptllnd/ptllnd_cb.c @@ -123,7 +123,7 @@ ptllnd_close_peer(ptllnd_peer_t *peer, int error) !list_empty(&peer->plp_noopq) || !list_empty(&peer->plp_activeq) || error != 0) { - CWARN("Closing %s\n", libcfs_id2str(peer->plp_id)); + CWARN("Closing %s: %d\n", libcfs_id2str(peer->plp_id), error); if (plni->plni_debug) ptllnd_dump_debug(ni, peer->plp_id); } @@ -341,40 +341,6 @@ ptllnd_dump_debug(lnet_ni_t *ni, lnet_process_id_t id) ptllnd_dump_history(); } -void -ptllnd_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive) -{ - lnet_process_id_t id; - ptllnd_peer_t *peer; - time_t start = cfs_time_current_sec(); - ptllnd_ni_t *plni = ni->ni_data; - int w = plni->plni_long_wait; - - /* This is only actually used to connect to routers at startup! */ - LASSERT(alive); - - id.nid = nid; - id.pid = LUSTRE_SRV_LNET_PID; - - peer = ptllnd_find_peer(ni, id, 1); - if (peer == NULL) - return; - - /* wait for the peer to reply */ - while (!peer->plp_recvd_hello) { - if (w > 0 && cfs_time_current_sec() > start + w/1000) { - CWARN("Waited %ds to connect to %s\n", - (int)(cfs_time_current_sec() - start), - libcfs_id2str(id)); - w *= 2; - } - - ptllnd_wait(ni, w); - } - - ptllnd_peer_decref(peer); -} - int ptllnd_setasync(lnet_ni_t *ni, lnet_process_id_t id, int nasync) { @@ -518,7 +484,7 @@ ptllnd_new_tx(ptllnd_peer_t *peer, int type, int payload_nob) ptllnd_peer_addref(peer); plni->plni_ntxs++; - CDEBUG(D_NET, "tx=%p\n",tx); + CDEBUG(D_NET, "tx=%p\n", tx); return tx; } @@ -859,7 +825,7 @@ ptllnd_check_sends(ptllnd_peer_t *peer) list_add_tail(&tx->tx_list, &peer->plp_activeq); CDEBUG(D_NET, "Sending at TX=%p type=%s (%d)\n",tx, - ptllnd_msgtype2str(tx->tx_type),tx->tx_type); + ptllnd_msgtype2str(tx->tx_type),tx->tx_type); if (tx->tx_type == PTLLND_MSG_TYPE_NOOP && !ptllnd_peer_send_noop(peer)) { @@ -960,15 +926,14 @@ ptllnd_passive_rdma(ptllnd_peer_t *peer, int type, lnet_msg_t *msg, if (tx == NULL) { CERROR("Can't allocate %s tx for %s\n", - type == PTLLND_MSG_TYPE_GET ? "GET" : "PUT/REPLY", - libcfs_id2str(peer->plp_id)); + ptllnd_msgtype2str(type), libcfs_id2str(peer->plp_id)); return -ENOMEM; } rc = ptllnd_set_txiov(tx, niov, iov, offset, len); if (rc != 0) { - CERROR ("Can't allocate iov %d for %s\n", - niov, libcfs_id2str(peer->plp_id)); + CERROR("Can't allocate iov %d for %s\n", + niov, libcfs_id2str(peer->plp_id)); rc = -ENOMEM; goto failed; } @@ -986,12 +951,24 @@ ptllnd_passive_rdma(ptllnd_peer_t *peer, int type, lnet_msg_t *msg, start = cfs_time_current_sec(); w = plni->plni_long_wait; + ptllnd_set_tx_deadline(tx); - while (!peer->plp_recvd_hello) { /* wait to validate plp_match */ + while (!peer->plp_recvd_hello) { /* wait to validate plp_match */ if (peer->plp_closing) { rc = -EIO; goto failed; } + + /* NB must check here to avoid unbounded wait - tx not yet + * on peer->plp_txq, so ptllnd_watchdog can't expire it */ + if (tx->tx_deadline < cfs_time_current_sec()) { + CERROR("%s tx for %s timed out\n", + ptllnd_msgtype2str(type), + libcfs_id2str(peer->plp_id)); + rc = -ETIMEDOUT; + goto failed; + } + if (w > 0 && cfs_time_current_sec() > start + w/1000) { CWARN("Waited %ds to connect to %s\n", (int)(cfs_time_current_sec() - start), @@ -1065,6 +1042,7 @@ ptllnd_passive_rdma(ptllnd_peer_t *peer, int type, lnet_msg_t *msg, return 0; failed: + tx->tx_status = rc; ptllnd_tx_done(tx); return rc; } @@ -1095,8 +1073,8 @@ ptllnd_active_rdma(ptllnd_peer_t *peer, int type, rc = ptllnd_set_txiov(tx, niov, iov, offset, len); if (rc != 0) { - CERROR ("Can't allocate iov %d for %s\n", - niov, libcfs_id2str(peer->plp_id)); + CERROR("Can't allocate iov %d for %s\n", + niov, libcfs_id2str(peer->plp_id)); rc = -ENOMEM; goto failed; } @@ -1427,6 +1405,13 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator, if (plni->plni_abort_on_nak) abort(); + plp = ptllnd_find_peer(ni, srcid, 0); + if (plp == NULL) { + CERROR("Ignore NAK from %s: no peer\n", libcfs_id2str(srcid)); + return; + } + ptllnd_close_peer(plp, -EPROTO); + ptllnd_peer_decref(plp); return; } @@ -1610,8 +1595,8 @@ ptllnd_buf_event (lnet_ni_t *ni, ptl_event_t *event) /* Portals can't force message alignment - someone sending an * odd-length message could misalign subsequent messages */ if ((event->mlength & 7) != 0) { - CERROR("Message from %s has odd length %u: " - "probable version incompatibility\n", + CERROR("Message from %s has odd length "LPU64 + " probable version incompatibility\n", ptllnd_ptlid2str(event->initiator), event->mlength); LBUG(); @@ -1786,7 +1771,12 @@ ptllnd_check_peer(ptllnd_peer_t *peer) if (tx == NULL) return; - CERROR("%s: timed out\n", libcfs_id2str(peer->plp_id)); + CERROR("%s (sent %d recvd %d, credits %d/%d/%d/%d/%d): timed out %p %p\n", + libcfs_id2str(peer->plp_id), peer->plp_sent_hello, peer->plp_recvd_hello, + peer->plp_credits, peer->plp_outstanding_credits, + peer->plp_sent_credits, peer->plp_lazy_credits, + peer->plp_extra_lazy_credits, tx, tx->tx_lnetmsg); + ptllnd_debug_tx(tx); ptllnd_close_peer(peer, -ETIMEDOUT); } -- 1.8.3.1