From b418bd36742a4c1d3f28dd2ee5823d54ece76035 Mon Sep 17 00:00:00 2001 From: maxim Date: Tue, 7 Apr 2009 11:09:13 +0000 Subject: [PATCH] b=18414 i=isaac i=liang Landing a patch fixing 'running out of ports' issue on HEAD. The patch essentially does: - add a delay before next reconnect attempt in ksocklnd in the case of lost race; - limit the frequency of query-requests in lnet; - special handling of 'dead peer' notifications in lnet. --- libcfs/include/libcfs/linux/linux-prim.h | 1 + libcfs/include/libcfs/winnt/winnt-prim.h | 4 +- libcfs/libcfs/winnt/winnt-curproc.c | 2 +- lnet/include/lnet/lib-types.h | 1 + lnet/klnds/socklnd/socklnd_cb.c | 86 ++++++++++++++++++++++---------- lnet/lnet/lib-move.c | 24 ++++++++- lnet/lnet/peer.c | 7 +-- lnet/lnet/router.c | 7 +++ 8 files changed, 99 insertions(+), 33 deletions(-) diff --git a/libcfs/include/libcfs/linux/linux-prim.h b/libcfs/include/libcfs/linux/linux-prim.h index 23cc250..b1f09b1 100644 --- a/libcfs/include/libcfs/linux/linux-prim.h +++ b/libcfs/include/libcfs/linux/linux-prim.h @@ -188,6 +188,7 @@ typedef sigset_t cfs_sigset_t; */ typedef struct timer_list cfs_timer_t; +#define CFS_MAX_SCHEDULE_TIMEOUT MAX_SCHEDULE_TIMEOUT #ifndef wait_event_timeout /* Only for RHEL3 2.4.21 kernel */ #define __wait_event_timeout(wq, condition, timeout, ret) \ diff --git a/libcfs/include/libcfs/winnt/winnt-prim.h b/libcfs/include/libcfs/winnt/winnt-prim.h index c5cbf65..accf7a9 100644 --- a/libcfs/include/libcfs/winnt/winnt-prim.h +++ b/libcfs/include/libcfs/winnt/winnt-prim.h @@ -564,8 +564,8 @@ typedef __u32 cfs_kernel_cap_t; * Task struct */ -#define MAX_SCHEDULE_TIMEOUT ((long_ptr_t)(~0UL>>12)) -#define schedule_timeout(t) cfs_schedule_timeout(0, t) +#define CFS_MAX_SCHEDULE_TIMEOUT ((long_ptr_t)(~0UL>>12)) +#define schedule_timeout(t) cfs_schedule_timeout(0, t) struct vfsmount; diff --git a/libcfs/libcfs/winnt/winnt-curproc.c b/libcfs/libcfs/winnt/winnt-curproc.c index 93d3af6..5b07ffa 100644 --- 
a/libcfs/libcfs/winnt/winnt-curproc.c +++ b/libcfs/libcfs/winnt/winnt-curproc.c @@ -454,7 +454,7 @@ cfs_schedule_timeout(cfs_task_state_t state, int64_t time) slot = CONTAINING_RECORD(task, TASK_SLOT, task); cfs_assert(slot->Magic == TASKSLT_MAGIC); - if (time == MAX_SCHEDULE_TIMEOUT) { + if (time == CFS_MAX_SCHEDULE_TIMEOUT) { time = 0; } diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 9c8b498..fdd69f2 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -410,6 +410,7 @@ typedef struct lnet_peer { long lp_txqnob; /* bytes queued for sending */ time_t lp_timestamp; /* time of last aliveness news */ time_t lp_last_alive; /* when I was last alive */ + time_t lp_last_query; /* when LND was queried last time */ time_t lp_ping_timestamp; /* time of last ping attempt */ time_t lp_ping_deadline; /* != 0 if ping reply expected */ lnet_ni_t *lp_ni; /* interface peer is on */ diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index e12800b..3be5ff5 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -1917,6 +1917,18 @@ ksocknal_connect (ksock_route_t *route) if (retry_later) { /* re-queue for attention; this frees me up to handle * the peer's incoming connection request */ + + if (rc == EALREADY) { + /* We want to introduce a delay before next + * attempt to connect if we lost conn race, + * but the race is resolved quickly usually, + * so min_reconnectms should be good heuristic */ + route->ksnr_retry_interval = + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000; + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); + } + ksocknal_launch_connection_locked(route); } @@ -1972,30 +1984,36 @@ ksocknal_connect (ksock_route_t *route) ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1); } -static inline int -ksocknal_connd_connect_route_locked(void) +/* Go through connd_routes queue looking for a route that + we 
can process right now */ +static ksock_route_t * +ksocknal_connd_get_route_locked(signed long *timeout_p) { - /* Only handle an outgoing connection request if there is someone left - * to handle incoming connections */ - return !list_empty(&ksocknal_data.ksnd_connd_routes) && - ((ksocknal_data.ksnd_connd_connecting + 1) < - *ksocknal_tunables.ksnd_nconnds); -} + ksock_route_t *route; + cfs_time_t now; -static inline int -ksocknal_connd_ready(void) -{ - int rc; + /* Only handle an outgoing connection request if there + * is someone left to handle incoming connections */ + if ((ksocknal_data.ksnd_connd_connecting + 1) >= + *ksocknal_tunables.ksnd_nconnds) + return NULL; - cfs_spin_lock_bh (&ksocknal_data.ksnd_connd_lock); + now = cfs_time_current(); - rc = ksocknal_data.ksnd_shuttingdown || - !list_empty(&ksocknal_data.ksnd_connd_connreqs) || - ksocknal_connd_connect_route_locked(); + /* connd_routes can contain both pending and ordinary routes */ + list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes, + ksnr_connd_list) { - cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock); + if (route->ksnr_retry_interval == 0 || + cfs_time_aftereq(now, route->ksnr_timeout)) + return route; - return rc; + if (*timeout_p == CFS_MAX_SCHEDULE_TIMEOUT || + (int)*timeout_p > (int)(route->ksnr_timeout - now)) + *timeout_p = (int)(route->ksnr_timeout - now); + } + + return NULL; } int @@ -2005,16 +2023,22 @@ ksocknal_connd (void *arg) char name[16]; ksock_connreq_t *cr; ksock_route_t *route; - int rc = 0; + cfs_waitlink_t wait; + signed long timeout; + int dropped_lock; snprintf (name, sizeof (name), "socknal_cd%02ld", id); cfs_daemonize (name); cfs_block_allsigs (); + cfs_waitlink_init (&wait); + cfs_spin_lock_bh (&ksocknal_data.ksnd_connd_lock); while (!ksocknal_data.ksnd_shuttingdown) { + dropped_lock = 0; + if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { /* Connection accepted by the listener */ cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next, @@ -2022,6 
+2046,7 @@ ksocknal_connd (void *arg) list_del(&cr->ksncr_list); cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock); + dropped_lock = 1; ksocknal_create_conn(cr->ksncr_ni, NULL, cr->ksncr_sock, SOCKLND_CONN_NONE); @@ -2031,14 +2056,17 @@ ksocknal_connd (void *arg) cfs_spin_lock_bh (&ksocknal_data.ksnd_connd_lock); } - if (ksocknal_connd_connect_route_locked()) { - /* Connection request */ - route = list_entry (ksocknal_data.ksnd_connd_routes.next, - ksock_route_t, ksnr_connd_list); + /* Sleep till explicit wake_up if no pending routes present */ + timeout = CFS_MAX_SCHEDULE_TIMEOUT; + + /* Connection request */ + route = ksocknal_connd_get_route_locked(&timeout); + if (route != NULL) { list_del (&route->ksnr_connd_list); ksocknal_data.ksnd_connd_connecting++; cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock); + dropped_lock = 1; ksocknal_connect (route); ksocknal_route_decref(route); @@ -2047,12 +2075,18 @@ ksocknal_connd (void *arg) ksocknal_data.ksnd_connd_connecting--; } + if (dropped_lock) + continue; + + /* Nothing to do for 'timeout' */ + cfs_set_current_state (CFS_TASK_INTERRUPTIBLE); + cfs_waitq_add_exclusive (&ksocknal_data.ksnd_connd_waitq, &wait); cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock); - cfs_wait_event_interruptible_exclusive( - ksocknal_data.ksnd_connd_waitq, - ksocknal_connd_ready(), rc); + cfs_waitq_timedwait (&wait, CFS_TASK_INTERRUPTIBLE, timeout); + cfs_set_current_state (CFS_TASK_RUNNING); + cfs_waitq_del (&ksocknal_data.ksnd_connd_waitq, &wait); cfs_spin_lock_bh (&ksocknal_data.ksnd_connd_lock); } diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 9d50747..a0bebbf 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -914,6 +914,8 @@ lnet_ni_peer_alive(lnet_peer_t *lp) (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive); LNET_LOCK(); + lp->lp_last_query = cfs_time_current_sec(); + if (last_alive != 0) /* NI has updated timestamp */ lp->lp_last_alive = last_alive; return; @@ -942,6 +944,9 @@ 
lnet_peer_is_alive (lnet_peer_t *lp, time_t now) return alive; } +/* don't query LND about aliveness of a dead peer more frequently than: */ +static int lnet_queryinterval = 1; /* 1 second */ + /* NB: returns 1 when alive, 0 when dead, negative when error; * may drop the LNET_LOCK */ int @@ -958,7 +963,24 @@ lnet_peer_alive_locked (lnet_peer_t *lp) if (lnet_peer_is_alive(lp, now)) return 1; - /* peer appears dead, query LND for latest aliveness news */ + /* peer appears dead, should we query right now? */ + if (lp->lp_last_query != 0) { + time_t deadline = + cfs_time_add(lp->lp_last_query, + lnet_queryinterval); + + if (cfs_time_before(now, deadline)) { + if (lp->lp_alive) + CWARN("Unexpected aliveness of peer %s: " + "%d < %d (%d/%d)\n", + libcfs_nid2str(lp->lp_nid), + (int)now, (int)deadline, + lnet_queryinterval, ni->ni_peertimeout); + return 0; + } + } + + /* query LND for latest aliveness news */ lnet_ni_peer_alive(lp); if (lnet_peer_is_alive(lp, now)) diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 7d68e6d4..c8ad591 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -184,11 +184,12 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid) lp->lp_notifylnd = 0; lp->lp_notifying = 0; lp->lp_alive_count = 0; - lp->lp_timestamp = 0; - lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */ + lp->lp_timestamp = 0; + lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! 
*/ lp->lp_last_alive = cfs_time_current_sec(); /* assumes alive */ + lp->lp_last_query = 0; /* didn't ask LND yet */ lp->lp_ping_timestamp = 0; - lp->lp_nid = nid; + lp->lp_nid = nid; lp->lp_refcount = 2; /* 1 for caller; 1 for hash */ lp->lp_rtr_refcount = 0; diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 3a211fa..2f9cc01 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -184,6 +184,13 @@ lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) return 0; } + /* We can't fully trust LND on reporting exact peer last_alive + * if he notifies us about dead peer. For example ksocklnd can + * call us with when == _time_when_the_node_was_booted_ if + * no connections were successfully established */ + if (ni != NULL && !alive && when < lp->lp_last_alive) + when = lp->lp_last_alive; + lnet_notify_locked(lp, ni == NULL, alive, when); LNET_UNLOCK(); -- 1.8.3.1