b=18414
author    maxim <maxim>    Tue, 7 Apr 2009 11:09:13 +0000 (11:09 +0000)
committer maxim <maxim>    Tue, 7 Apr 2009 11:09:13 +0000 (11:09 +0000)
i=isaac
i=liang
Landing a patch fixing the 'running out of ports' issue on HEAD. The patch
essentially does:
 - add a delay before the next reconnect attempt in ksocklnd in the case of a lost connection race (sketched below);
 - limit the frequency of query requests in lnet;
 - special handling of 'dead peer' notifications in lnet.
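
A minimal sketch of the first item under assumed names (struct route,
note_connect_result, min_reconnectms are all illustrative; the real change
is in the socklnd_cb.c hunks below):

    #include <errno.h>
    #include <time.h>

    /* Illustrative route state, loosely modelled on ksock_route_t. */
    struct route {
            long   retry_interval;  /* seconds to back off; 0 = none */
            time_t timeout;         /* earliest time of the next attempt */
    };

    /* min_reconnectms stands in for *ksocknal_tunables.ksnd_min_reconnectms. */
    static void note_connect_result(struct route *r, int rc, long min_reconnectms)
    {
            if (rc == EALREADY) {
                    /* Lost the connection race: delay the next attempt.
                     * The race usually resolves quickly, so the minimum
                     * reconnect interval is a reasonable first delay. */
                    r->retry_interval = min_reconnectms / 1000;
                    r->timeout = time(NULL) + r->retry_interval;
            }
    }

    int main(void)
    {
            struct route r = { 0, 0 };

            note_connect_result(&r, EALREADY, 2000 /* ms */);
            return r.timeout > time(NULL) ? 0 : 1;
    }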

libcfs/include/libcfs/linux/linux-prim.h
libcfs/include/libcfs/winnt/winnt-prim.h
libcfs/libcfs/winnt/winnt-curproc.c
lnet/include/lnet/lib-types.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/lnet/lib-move.c
lnet/lnet/peer.c
lnet/lnet/router.c

libcfs/include/libcfs/linux/linux-prim.h
index 23cc250..b1f09b1 100644
@@ -188,6 +188,7 @@ typedef sigset_t                        cfs_sigset_t;
  */
 typedef struct timer_list cfs_timer_t;
 
+#define CFS_MAX_SCHEDULE_TIMEOUT MAX_SCHEDULE_TIMEOUT
 
 #ifndef wait_event_timeout /* Only for RHEL3 2.4.21 kernel */
 #define __wait_event_timeout(wq, condition, timeout, ret)        \
libcfs/include/libcfs/winnt/winnt-prim.h
index c5cbf65..accf7a9 100644
@@ -564,8 +564,8 @@ typedef __u32 cfs_kernel_cap_t;
  * Task struct
  */
 
-#define MAX_SCHEDULE_TIMEOUT    ((long_ptr_t)(~0UL>>12))
-#define schedule_timeout(t)     cfs_schedule_timeout(0, t)
+#define CFS_MAX_SCHEDULE_TIMEOUT ((long_ptr_t)(~0UL>>12))
+#define schedule_timeout(t)      cfs_schedule_timeout(0, t)
 
 struct vfsmount;
 
libcfs/libcfs/winnt/winnt-curproc.c
index 93d3af6..5b07ffa 100644
@@ -454,7 +454,7 @@ cfs_schedule_timeout(cfs_task_state_t state, int64_t time)
     slot = CONTAINING_RECORD(task, TASK_SLOT, task);
     cfs_assert(slot->Magic == TASKSLT_MAGIC);
 
-    if (time == MAX_SCHEDULE_TIMEOUT) {
+    if (time == CFS_MAX_SCHEDULE_TIMEOUT) {
         time = 0;
     }
 
lnet/include/lnet/lib-types.h
index 9c8b498..fdd69f2 100644
@@ -410,6 +410,7 @@ typedef struct lnet_peer {
         long              lp_txqnob;            /* bytes queued for sending */
         time_t            lp_timestamp;         /* time of last aliveness news */
         time_t            lp_last_alive;        /* when I was last alive */
+        time_t            lp_last_query;        /* when LND was queried last time */
         time_t            lp_ping_timestamp;    /* time of last ping attempt */
         time_t            lp_ping_deadline;     /* != 0 if ping reply expected */
         lnet_ni_t        *lp_ni;                /* interface peer is on */
lnet/klnds/socklnd/socklnd_cb.c
index e12800b..3be5ff5 100644
@@ -1917,6 +1917,18 @@ ksocknal_connect (ksock_route_t *route)
         if (retry_later) {
                 /* re-queue for attention; this frees me up to handle
                  * the peer's incoming connection request */
+
+                if (rc == EALREADY) {
+                        /* We want to introduce a delay before the next
+                         * connect attempt if we lost the connection race,
+                         * but the race is usually resolved quickly, so
+                         * min_reconnectms should be a good heuristic */
+                        route->ksnr_retry_interval =
+                                cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000;
+                        route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+                                                           route->ksnr_retry_interval);
+                }
+
                 ksocknal_launch_connection_locked(route);
         }
 
@@ -1972,30 +1984,36 @@ ksocknal_connect (ksock_route_t *route)
         ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
 }
 
-static inline int
-ksocknal_connd_connect_route_locked(void)
+/* Go through connd_routes queue looking for a route that
+   we can process right now */
+static ksock_route_t *
+ksocknal_connd_get_route_locked(signed long *timeout_p)
 {
-        /* Only handle an outgoing connection request if there is someone left
-         * to handle incoming connections */
-        return !list_empty(&ksocknal_data.ksnd_connd_routes) &&
-                ((ksocknal_data.ksnd_connd_connecting + 1) <
-                 *ksocknal_tunables.ksnd_nconnds);
-}
+        ksock_route_t *route;
+        cfs_time_t     now;
 
-static inline int
-ksocknal_connd_ready(void)
-{
-        int            rc;
+        /* Only handle an outgoing connection request if there
+         * is someone left to handle incoming connections */
+        if ((ksocknal_data.ksnd_connd_connecting + 1) >=
+            *ksocknal_tunables.ksnd_nconnds)
+                return NULL;
 
-        cfs_spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
+        now = cfs_time_current();
 
-        rc = ksocknal_data.ksnd_shuttingdown ||
-             !list_empty(&ksocknal_data.ksnd_connd_connreqs) ||
-             ksocknal_connd_connect_route_locked();
+        /* connd_routes can contain both pending and ordinary routes */
+        list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes,
+                             ksnr_connd_list) {
 
-        cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
+                if (route->ksnr_retry_interval == 0 ||
+                    cfs_time_aftereq(now, route->ksnr_timeout))
+                        return route;
 
-        return rc;
+                if (*timeout_p == CFS_MAX_SCHEDULE_TIMEOUT ||
+                    (int)*timeout_p > (int)(route->ksnr_timeout - now))
+                        *timeout_p = (int)(route->ksnr_timeout - now);
+        }
+
+        return NULL;
 }
 
 int
@@ -2005,16 +2023,22 @@ ksocknal_connd (void *arg)
         char               name[16];
         ksock_connreq_t   *cr;
         ksock_route_t     *route;
-        int                rc = 0;
+        cfs_waitlink_t     wait;
+        signed long        timeout;
+        int                dropped_lock;
 
         snprintf (name, sizeof (name), "socknal_cd%02ld", id);
         cfs_daemonize (name);
         cfs_block_allsigs ();
 
+        cfs_waitlink_init (&wait);
+
         cfs_spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
 
         while (!ksocknal_data.ksnd_shuttingdown) {
 
+                dropped_lock = 0;
+
                 if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
                         /* Connection accepted by the listener */
                         cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next,
@@ -2022,6 +2046,7 @@ ksocknal_connd (void *arg)
 
                         list_del(&cr->ksncr_list);
                         cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
+                        dropped_lock = 1;
 
                         ksocknal_create_conn(cr->ksncr_ni, NULL,
                                              cr->ksncr_sock, SOCKLND_CONN_NONE);
@@ -2031,14 +2056,17 @@ ksocknal_connd (void *arg)
                         cfs_spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
                 }
 
-                if (ksocknal_connd_connect_route_locked()) {
-                        /* Connection request */
-                        route = list_entry (ksocknal_data.ksnd_connd_routes.next,
-                                            ksock_route_t, ksnr_connd_list);
+                /* Sleep till explicit wake_up if no pending routes present */
+                timeout = CFS_MAX_SCHEDULE_TIMEOUT;
+
+                /* Connection request */
+                route = ksocknal_connd_get_route_locked(&timeout);
 
+                if (route != NULL) {
                         list_del (&route->ksnr_connd_list);
                         ksocknal_data.ksnd_connd_connecting++;
                         cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
+                        dropped_lock = 1;
 
                         ksocknal_connect (route);
                         ksocknal_route_decref(route);
@@ -2047,12 +2075,18 @@ ksocknal_connd (void *arg)
                         ksocknal_data.ksnd_connd_connecting--;
                 }
 
+                if (dropped_lock)
+                        continue;
+
+                /* Nothing to do for 'timeout'  */
+                cfs_set_current_state (CFS_TASK_INTERRUPTIBLE);
+                cfs_waitq_add_exclusive (&ksocknal_data.ksnd_connd_waitq, &wait);
                 cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
 
-                cfs_wait_event_interruptible_exclusive(
-                        ksocknal_data.ksnd_connd_waitq,
-                        ksocknal_connd_ready(), rc);
+                cfs_waitq_timedwait (&wait, CFS_TASK_INTERRUPTIBLE, timeout);
 
+                cfs_set_current_state (CFS_TASK_RUNNING);
+                cfs_waitq_del (&ksocknal_data.ksnd_connd_waitq, &wait);
                 cfs_spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
         }
 
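
The net effect of the connd changes above: routes carry retry deadlines, the
queue scan returns the first route that is due while tracking the nearest
pending deadline, and the daemon sleeps no longer than that before rescanning.
A rough userspace analogue of the scan, with all names invented for
illustration:

    #include <stdio.h>
    #include <time.h>

    struct route {
            time_t deadline;        /* 0 = ready now (no retry delay) */
    };

    /* Return the index of the first route that may be processed now, or -1;
     * lower *timeout to the nearest pending deadline, mirroring what
     * ksocknal_connd_get_route_locked() does for the timed wait that follows. */
    static int get_route(struct route *r, int n, time_t now, long *timeout)
    {
            int i;

            for (i = 0; i < n; i++) {
                    if (r[i].deadline == 0 || now >= r[i].deadline)
                            return i;
                    if (*timeout > r[i].deadline - now)
                            *timeout = r[i].deadline - now;
            }
            return -1;
    }

    int main(void)
    {
            time_t now = time(NULL);
            struct route r[] = { { now + 5 }, { now + 2 }, { now + 9 } };
            long timeout = 3600;    /* stands in for CFS_MAX_SCHEDULE_TIMEOUT */
            int i = get_route(r, 3, now, &timeout);

            /* No route is due yet, so sleep at most 2s, then rescan. */
            printf("route=%d, sleep<=%lds\n", i, timeout);
            return 0;
    }
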
lnet/lnet/lib-move.c
index 9d50747..a0bebbf 100644
@@ -914,6 +914,8 @@ lnet_ni_peer_alive(lnet_peer_t *lp)
         (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
         LNET_LOCK();
 
+        lp->lp_last_query = cfs_time_current_sec();
+
         if (last_alive != 0) /* NI has updated timestamp */
                 lp->lp_last_alive = last_alive;
         return;
@@ -942,6 +944,9 @@ lnet_peer_is_alive (lnet_peer_t *lp, time_t now)
         return alive;
 }
 
+/* don't query LND about aliveness of a dead peer more frequently than: */
+static int lnet_queryinterval = 1; /* 1 second */
+
 /* NB: returns 1 when alive, 0 when dead, negative when error;
  *     may drop the LNET_LOCK */
 int
@@ -958,7 +963,24 @@ lnet_peer_alive_locked (lnet_peer_t *lp)
         if (lnet_peer_is_alive(lp, now))
                 return 1;
 
-        /* peer appears dead, query LND for latest aliveness news */
+        /* peer appears dead, should we query right now? */
+        if (lp->lp_last_query != 0) {
+                time_t deadline =
+                        cfs_time_add(lp->lp_last_query,
+                                     lnet_queryinterval);
+
+                if (cfs_time_before(now, deadline)) {
+                        if (lp->lp_alive)
+                                CWARN("Unexpected aliveness of peer %s: "
+                                      "%d < %d (%d/%d)\n",
+                                      libcfs_nid2str(lp->lp_nid),
+                                      (int)now, (int)deadline,
+                                      lnet_queryinterval, ni->ni_peertimeout);
+                        return 0;
+                }
+        }
+
+        /* query LND for latest aliveness news */
         lnet_ni_peer_alive(lp);
 
         if (lnet_peer_is_alive(lp, now))
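
The lib-move.c change above amounts to a per-peer rate limit on LND aliveness
queries. A compact sketch of that rule, with invented names (should_query,
QUERY_INTERVAL):

    #include <stdio.h>
    #include <time.h>

    #define QUERY_INTERVAL 1        /* seconds, as lnet_queryinterval */

    struct peer {
            time_t last_query;      /* 0 = LND never queried yet */
    };

    /* Query the LND at most once per QUERY_INTERVAL per peer; in between,
     * callers rely on the cached aliveness (lp_last_alive in the patch). */
    static int should_query(struct peer *p, time_t now)
    {
            if (p->last_query != 0 && now < p->last_query + QUERY_INTERVAL)
                    return 0;       /* too soon: reuse cached aliveness */
            p->last_query = now;    /* as lnet_ni_peer_alive() records it */
            return 1;
    }

    int main(void)
    {
            struct peer p = { 0 };
            time_t now = time(NULL);

            printf("%d\n", should_query(&p, now));  /* 1: first query allowed */
            printf("%d\n", should_query(&p, now));  /* 0: throttled */
            return 0;
    }
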
lnet/lnet/peer.c
index 7d68e6d..c8ad591 100644
@@ -184,11 +184,12 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid)
         lp->lp_notifylnd = 0;
         lp->lp_notifying = 0;
         lp->lp_alive_count = 0;
-       lp->lp_timestamp = 0;
-       lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
+        lp->lp_timestamp = 0;
+        lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
         lp->lp_last_alive = cfs_time_current_sec(); /* assumes alive */
+        lp->lp_last_query = 0; /* didn't ask LND yet */
         lp->lp_ping_timestamp = 0;
-       lp->lp_nid = nid;
+        lp->lp_nid = nid;
         lp->lp_refcount = 2;                    /* 1 for caller; 1 for hash */
         lp->lp_rtr_refcount = 0;
 
lnet/lnet/router.c
index 3a211fa..2f9cc01 100644
@@ -184,6 +184,13 @@ lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when)
                 return 0;
         }
 
+        /* We can't fully trust the LND to report an exact peer last_alive
+         * when it notifies us about a dead peer. For example, ksocklnd can
+         * call us with when == _time_when_the_node_was_booted_ if no
+         * connections were ever successfully established */
+        if (ni != NULL && !alive && when < lp->lp_last_alive)
+                when = lp->lp_last_alive;
+
         lnet_notify_locked(lp, ni == NULL, alive, when);
 
         LNET_UNLOCK();
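
The router.c hunk above guards against an LND reporting a stale timestamp
along with a 'dead' verdict: never let such a notification rewind the peer's
last_alive. The rule in isolation, with illustrative names (clamp_when is
invented):

    #include <stdio.h>
    #include <time.h>

    /* A "dead" notification may carry a timestamp that predates news we
     * already have (e.g. ksocklnd reporting boot time when no connection
     * was ever established), so clamp it to the known last_alive. */
    static time_t clamp_when(time_t when, time_t last_alive, int alive)
    {
            if (!alive && when < last_alive)
                    return last_alive;
            return when;
    }

    int main(void)
    {
            time_t last_alive = 1000;

            printf("%ld\n", (long)clamp_when(500, last_alive, 0));  /* 1000 */
            printf("%ld\n", (long)clamp_when(500, last_alive, 1));  /* 500 */
            return 0;
    }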