From: Amir Shehata Date: Thu, 11 Dec 2014 18:52:26 +0000 (-0800) Subject: LU-6003 lnet: improvement to router checker X-Git-Tag: 2.6.94~43 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=dc7f34c31466c7e4920ce036440ac79bd4ba22e1 LU-6003 lnet: improvement to router checker This patch starts router checker thread all the time. The router checker only checks routes by ping if live_router_check_interval or dead_router_check_interval are set to something other than 0, and there are routes configured. If these conditions are not met the router checker sleeps until woken up when a route is added. It is also woken up whenever the RC is being stopped to ensure the thread doesn't hang. In the future when DLC starts configuring the live and dead router_check_interval parameters, then by manipulating them the router checker can be turned on and off by the user. Signed-off-by: Amir Shehata Change-Id: I778690755e7121abd575f1a261637cb6dc754edd Reviewed-on: http://review.whamcloud.com/13035 Tested-by: Jenkins Reviewed-by: Liang Zhen Tested-by: Maloo Reviewed-by: Doug Oucharek Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 5199f84..2e53b98 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -867,6 +867,11 @@ typedef struct */ bool ln_nis_from_mod_params; + /* waitq for router checker. As long as there are no routes in + * the list, the router checker will sleep on this queue. when + * routes are added the thread will wake up */ + wait_queue_head_t ln_rc_waitq; + #ifndef __KERNEL__ /* Temporary workaround to allow uOSS and test programs force * server mode in userspace. The only place where we use it is diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 3ec8962..2c2f38c 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -105,6 +105,7 @@ lnet_init_locks(void) { spin_lock_init(&the_lnet.ln_eq_wait_lock); init_waitqueue_head(&the_lnet.ln_eq_waitq); + init_waitqueue_head(&the_lnet.ln_rc_waitq); mutex_init(&the_lnet.ln_lnd_mutex); mutex_init(&the_lnet.ln_api_mutex); } diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index ddd83aa..2605687 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -416,6 +416,9 @@ lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway, if (rnet != rnet2) LIBCFS_FREE(rnet, sizeof(*rnet)); + /* indicate to startup the router checker if configured */ + wake_up(&the_lnet.ln_rc_waitq); + return rc; } @@ -1136,11 +1139,6 @@ lnet_router_checker_start(void) return -EINVAL; } - if (!the_lnet.ln_routing && - live_router_check_interval <= 0 && - dead_router_check_interval <= 0) - return 0; - #ifdef __KERNEL__ sema_init(&the_lnet.ln_rc_signal, 0); /* EQ size doesn't matter; the callback is guaranteed to get every @@ -1185,13 +1183,15 @@ lnet_router_checker_start(void) void lnet_router_checker_stop (void) { - int rc; + int rc; - if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) - return; + if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) + return; - LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING; + /* wakeup the RC thread if it's sleeping */ + wake_up(&the_lnet.ln_rc_waitq); #ifdef __KERNEL__ /* block until event callback signals exit */ @@ -1289,17 +1289,42 @@ lnet_prune_rc_data(int wait_unlink) #if defined(__KERNEL__) && defined(LNET_ROUTER) +/* + * This function is called to check if the RC should block indefinitely. + * It's called from lnet_router_checker() as well as being passed to + * wait_event_interruptible() to avoid the lost wake_up problem. + * + * When it's called from wait_event_interruptible() it is necessary to + * also not sleep if the rc state is not running to avoid a deadlock + * when the system is shutting down + */ +static inline bool +lnet_router_checker_active(void) +{ + if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) + return true; + + /* Router Checker thread needs to run when routing is enabled in + * order to call lnet_update_ni_status_locked() */ + if (the_lnet.ln_routing) + return true; + + return !list_empty(&the_lnet.ln_routers) && + (live_router_check_interval > 0 || + dead_router_check_interval > 0); +} + static int lnet_router_checker(void *arg) { - lnet_peer_t *rtr; + lnet_peer_t *rtr; struct list_head *entry; - cfs_block_allsigs(); + cfs_block_allsigs(); - LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); - while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { + while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { __u64 version; int cpt; int cpt2; @@ -1321,14 +1346,14 @@ rescan: goto rescan; } - lnet_ping_router_locked(rtr); + lnet_ping_router_locked(rtr); - /* NB dropped lock */ - if (version != the_lnet.ln_routers_version) { - /* the routers list has changed */ - goto rescan; - } - } + /* NB dropped lock */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ + goto rescan; + } + } if (the_lnet.ln_routing) lnet_update_ni_status_locked(); @@ -1340,8 +1365,16 @@ rescan: /* Call cfs_pause() here always adds 1 to load average * because kernel counts # active tasks as nr_running * + nr_uninterruptible. */ - schedule_timeout_and_set_state(TASK_INTERRUPTIBLE, - cfs_time_seconds(1)); + /* if there are any routes then wakeup every second. If + * there are no routes then sleep indefinitely until woken + * up by a user adding a route */ + if (!lnet_router_checker_active()) + wait_event_interruptible(the_lnet.ln_rc_waitq, + lnet_router_checker_active()); + else + wait_event_interruptible_timeout(the_lnet.ln_rc_waitq, + false, + cfs_time_seconds(1)); } LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING);