b=18948

author anserper <anserper>

Wed, 13 May 2009 20:02:04 +0000 (20:02 +0000)

committer anserper <anserper>

Wed, 13 May 2009 20:02:04 +0000 (20:02 +0000)
author anserper <anserper>
Wed, 13 May 2009 20:02:04 +0000 (20:02 +0000)
committer anserper <anserper>
Wed, 13 May 2009 20:02:04 +0000 (20:02 +0000)
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h

index ad8ec2a..f2e8328 100644 (file)
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -90,6 +90,8 @@ extern unsigned int obd_alloc_fail_rate;
  #define PING_INTERVAL max(obd_timeout / 4, 1U)
  /* a bit more than maximal journal commit time in seconds */
  #define PING_INTERVAL_SHORT 7
+/* maximum server ping service time excluding network latency */
+#define PING_SVC_TIMEOUT 15
  /* Client may skip 1 ping; we must wait at least 2.5. But for multiple
   * failover targets the client only pings one server at a time, and pings
   * can be lost on a loaded network. Since eviction has serious consequences,
@@ -101,8 +103,8 @@ extern unsigned int obd_alloc_fail_rate;
   /* Max connect interval for nonresponsive servers; ~50s to avoid building up
      connect requests in the LND queues, but within obd_timeout so we don't
      miss the recovery window */
-#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout))
-#define CONNECTION_SWITCH_INC 5  /* Connection timeout backoff */
+#define CONNECTION_SWITCH_MAX min(25U, max(CONNECTION_SWITCH_MIN,obd_timeout))
+#define CONNECTION_SWITCH_INC 1  /* Connection timeout backoff */
  #ifndef CRAY_XT3
  /* In general this should be low to have quick detection of a system
     running on a backup server. (If it's too low, import_select_connection
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c

index f7cc6df..cb1eaea 100644 (file)
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -499,8 +499,8 @@ static int import_select_connection(struct obd_import *imp)
                  if (at_get(&imp->imp_at.iat_net_latency) <
                      CONNECTION_SWITCH_MAX) {
                          at_add(&imp->imp_at.iat_net_latency,
-                               at_get(&imp->imp_at.iat_net_latency) +
-                               CONNECTION_SWITCH_INC);
+                               MIN(at_get(&imp->imp_at.iat_net_latency) +
+                               CONNECTION_SWITCH_INC, CONNECTION_SWITCH_MAX));
                  }
                  LASSERT(imp_conn->oic_last_attempt);
                  CWARN("%s: tried all connections, increasing latency to %ds\n",
diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c

index 8d51078..ef33565 100644 (file)
--- a/lustre/ptlrpc/pinger.c
+++ b/lustre/ptlrpc/pinger.c
@@ -97,6 +97,20 @@ int ptlrpc_ping(struct obd_import *imp)
                  DEBUG_REQ(D_INFO, req, "pinging %s->%s",
                            imp->imp_obd->obd_uuid.uuid,
                            obd2cli_tgt(imp->imp_obd));
+
+                /* To quickly detect server failure ping timeouts must be
+                 * kept small.  Therefore we must override/ignore the server
+                 * rpc completion estimate which may be very large since
+                 * it includes non-ping service times.  The right long term
+                 * fix will be to add a per-server (not per-service) thread
+                 * in order to reduce the number of pings in the system in
+                 * general (see bug 12471). */
+                if (!AT_OFF) {
+                        req->rq_timeout = PING_SVC_TIMEOUT +
+                                          at_get(&imp->imp_at.iat_net_latency);
+                        lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+                }
+
                  ptlrpcd_add_req(req);
          } else {
                  CERROR("OOM trying to ping %s->%s\n",
@@ -109,17 +123,33 @@ int ptlrpc_ping(struct obd_import *imp)
  }
  EXPORT_SYMBOL(ptlrpc_ping);
  
-void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
+static void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
  {
  #ifdef ENABLE_PINGER
-        int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
-        if (imp->imp_state == LUSTRE_IMP_DISCON) {
-                int dtime = max_t(int, CONNECTION_SWITCH_MIN,
-                                  AT_OFF ? 0 :
-                                  at_get(&imp->imp_at.iat_net_latency));
-                time = min(time, dtime);
+        cfs_time_t delay, dtime, ctime = cfs_time_current();
+
+        if (imp->imp_state == LUSTRE_IMP_DISCON ||
+            imp->imp_state == LUSTRE_IMP_CONNECTING) {
+                /* In the disconnected case aggressively reconnect, for
+                 * this request the AT service timeout will be set to
+                 * INITIAL_CONNECT_TIMEOUT.  To ensure the request times
+                 * out before we send another we add one extra second. */
+                dtime = cfs_time_seconds(max_t(int, CONNECTION_SWITCH_MIN,
+                                AT_OFF ? 0 : INITIAL_CONNECT_TIMEOUT + 1 +
+                                at_get(&imp->imp_at.iat_net_latency)));
+        } else {
+                /* In the common case we want to cluster the pings at
+                 * at regular intervals to minimize system noise. */
+                delay = cfs_time_seconds(soon ? PING_INTERVAL_SHORT :
+                                         PING_INTERVAL);
+                dtime = delay - (ctime % delay);
          }
-        imp->imp_next_ping = cfs_time_shift(time);
+        /* May harmlessly race with ptlrpc_update_next_ping() */
+        imp->imp_next_ping = cfs_time_add(ctime, dtime);
+
+        CDEBUG(D_HA, "Setting %s next ping to "CFS_TIME_T" ("CFS_TIME_T")\n",
+               obd2cli_tgt(imp->imp_obd), imp->imp_next_ping, dtime);
+
  #endif /* ENABLE_PINGER */
  }
  
@@ -171,9 +201,13 @@ static int ptlrpc_pinger_main(void *arg)
                  cfs_time_t this_ping = cfs_time_current();
                  struct l_wait_info lwi;
                  cfs_duration_t time_to_next_wake;
+                cfs_time_t time_of_next_wake;
                  struct timeout_item *item;
                  struct list_head *iter;
  
+                time_to_next_wake = cfs_time_seconds(PING_INTERVAL);
+                time_of_next_wake = cfs_time_shift(PING_INTERVAL);
+
                  mutex_down(&pinger_sem);
                  list_for_each_entry(item, &timeout_list, ti_chain) {
                          item->ti_cb(item, item->ti_cb_data);
@@ -195,16 +229,17 @@ static int ptlrpc_pinger_main(void *arg)
                                 ptlrpc_import_state_name(level), level,
                                 force, imp->imp_deactive, imp->imp_pingable);
  
+                        /* Include any ping which misses the deadline by up to
+                         * 1/10 of a second.  The pings are designed to clump
+                         * and this helps ensure the entire batch gets sent
+                         * promptly, which minimizes system noise from pings */
+
                          if (force ||
-                            /* if the next ping is within, say, 5 jiffies from
-                               now, go ahead and ping. See note below. */
-                            cfs_time_aftereq(this_ping, 
-                                             imp->imp_next_ping - 5 * CFS_TICK)) {
+                            cfs_time_aftereq(this_ping, imp->imp_next_ping -
+                                             (cfs_time_seconds(1) + 9) / 10)) {
                                  if (level == LUSTRE_IMP_DISCON &&
                                      !imp_is_deactive(imp)) {
-                                        /* wait at least a timeout before
-                                           trying recovery again. */
-                                        imp->imp_next_ping = cfs_time_shift(obd_timeout);
+                                        ptlrpc_update_next_ping(imp, 0);
                                          ptlrpc_initiate_recovery(imp);
                                  } else if (level != LUSTRE_IMP_FULL ||
                                           imp->imp_obd->obd_no_recov ||
@@ -218,6 +253,11 @@ static int ptlrpc_pinger_main(void *arg)
                                                 imp->imp_obd->obd_no_recov);
                                  } else if (imp->imp_pingable || force) {
                                                  ptlrpc_ping(imp);
+                                                /* ptlrpc_pinger_sending_on_import()
+                                                 * will asynch update imp_next_ping
+                                                 * so it must not be used below to
+                                                 * calculate minimum wait time. */
+                                                continue;
                                  }
                          } else {
                                  if (!imp->imp_pingable)
@@ -229,30 +269,23 @@ static int ptlrpc_pinger_main(void *arg)
                                         imp->imp_next_ping, this_ping);
                          }
  
-                        /* obd_timeout might have changed */
-                        if (cfs_time_after(imp->imp_next_ping,
-                                           cfs_time_add(this_ping, 
-                                                        cfs_time_seconds(PING_INTERVAL))))
-                                ptlrpc_update_next_ping(imp, 0);
+                        /* Wait time until next ping, or until we stopped. */
+                        if (cfs_time_before(imp->imp_next_ping,
+                                            time_of_next_wake)) {
+                                time_of_next_wake = imp->imp_next_ping;
+                                time_to_next_wake = max_t(cfs_duration_t,
+                                        cfs_time_seconds(1),
+                                        cfs_time_sub(time_of_next_wake,
+                                                     cfs_time_current()));
+                        }
                  }
                  mutex_up(&pinger_sem);
-                /* update memory usage info */
                  obd_update_maxusage();
+                CDEBUG(D_INFO, "next ping in "CFS_DURATION_T" ("CFS_TIME_T")\n",
+                               time_to_next_wake, time_of_next_wake);
  
-                /* Wait until the next ping time, or until we're stopped. */
-                time_to_next_wake = pinger_check_timeout(this_ping);
-                /* The ping sent by ptlrpc_send_rpc may get sent out
-                   say .01 second after this.
-                   ptlrpc_pinger_eending_on_import will then set the
-                   next ping time to next_ping + .01 sec, which means
-                   we will SKIP the next ping at next_ping, and the
-                   ping will get sent 2 timeouts from now!  Beware. */
-                CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" ("CFS_TIME_T")\n",
-                               time_to_next_wake,
-                               cfs_time_add(this_ping, cfs_time_seconds(PING_INTERVAL)));
                  if (time_to_next_wake > 0) {
-                        lwi = LWI_TIMEOUT(max_t(cfs_duration_t, time_to_next_wake, cfs_time_seconds(1)),
-                                          NULL, NULL);
+                        lwi = LWI_TIMEOUT(time_to_next_wake, NULL, NULL);
                          l_wait_event(thread->t_ctl_waitq,
                                       thread->t_flags & (SVC_STOPPING|SVC_EVENT),
                                       &lwi);
@@ -700,8 +733,9 @@ static int pinger_check_rpcs(void *arg)
                          list_entry(iter, struct obd_import, imp_pinger_chain);
                  int generation, level;
  
-                if (cfs_time_aftereq(pd->pd_this_ping, 
-                                     imp->imp_next_ping - 5 * CFS_TICK)) {
+                /* Include any ping within 1/10 of a second of the deadline */
+                if (cfs_time_aftereq(pd->pd_this_ping, imp->imp_next_ping -
+                                     (cfs_time_seconds(1) + 9) / 10)) {
                          /* Add a ping. */
                          spin_lock(&imp->imp_lock);
                          generation = imp->imp_generation;
author	anserper <anserper>
	Wed, 13 May 2009 20:02:04 +0000 (20:02 +0000)
committer	anserper <anserper>
	Wed, 13 May 2009 20:02:04 +0000 (20:02 +0000)
lustre/include/obd_support.h		patch \| blob \| history
lustre/ptlrpc/import.c		patch \| blob \| history
lustre/ptlrpc/pinger.c		patch \| blob \| history