Whamcloud - gitweb
b=18948
authoranserper <anserper>
Wed, 13 May 2009 20:02:04 +0000 (20:02 +0000)
committeranserper <anserper>
Wed, 13 May 2009 20:02:04 +0000 (20:02 +0000)
i=Andreas Dilger
i=Nathan Rutman

Snappy Recovery (v4)

lustre/include/obd_support.h
lustre/ptlrpc/import.c
lustre/ptlrpc/pinger.c

index ad8ec2a..f2e8328 100644 (file)
@@ -90,6 +90,8 @@ extern unsigned int obd_alloc_fail_rate;
 #define PING_INTERVAL max(obd_timeout / 4, 1U)
 /* a bit more than maximal journal commit time in seconds */
 #define PING_INTERVAL_SHORT 7
+/* maximum server ping service time excluding network latency */
+#define PING_SVC_TIMEOUT 15
 /* Client may skip 1 ping; we must wait at least 2.5. But for multiple
  * failover targets the client only pings one server at a time, and pings
  * can be lost on a loaded network. Since eviction has serious consequences,
@@ -101,8 +103,8 @@ extern unsigned int obd_alloc_fail_rate;
  /* Max connect interval for nonresponsive servers; ~50s to avoid building up
     connect requests in the LND queues, but within obd_timeout so we don't
     miss the recovery window */
-#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout))
-#define CONNECTION_SWITCH_INC 5  /* Connection timeout backoff */
+#define CONNECTION_SWITCH_MAX min(25U, max(CONNECTION_SWITCH_MIN,obd_timeout))
+#define CONNECTION_SWITCH_INC 1  /* Connection timeout backoff */
 #ifndef CRAY_XT3
 /* In general this should be low to have quick detection of a system
    running on a backup server. (If it's too low, import_select_connection
index f7cc6df..cb1eaea 100644 (file)
@@ -499,8 +499,8 @@ static int import_select_connection(struct obd_import *imp)
                 if (at_get(&imp->imp_at.iat_net_latency) <
                     CONNECTION_SWITCH_MAX) {
                         at_add(&imp->imp_at.iat_net_latency,
-                               at_get(&imp->imp_at.iat_net_latency) +
-                               CONNECTION_SWITCH_INC);
+                               MIN(at_get(&imp->imp_at.iat_net_latency) +
+                               CONNECTION_SWITCH_INC, CONNECTION_SWITCH_MAX));
                 }
                 LASSERT(imp_conn->oic_last_attempt);
                 CWARN("%s: tried all connections, increasing latency to %ds\n",
index 8d51078..ef33565 100644 (file)
@@ -97,6 +97,20 @@ int ptlrpc_ping(struct obd_import *imp)
                 DEBUG_REQ(D_INFO, req, "pinging %s->%s",
                           imp->imp_obd->obd_uuid.uuid,
                           obd2cli_tgt(imp->imp_obd));
+
+                /* To quickly detect server failure ping timeouts must be
+                 * kept small.  Therefore we must override/ignore the server
+                 * rpc completion estimate which may be very large since
+                 * it includes non-ping service times.  The right long term
+                 * fix will be to add a per-server (not per-service) thread
+                 * in order to reduce the number of pings in the system in
+                 * general (see bug 12471). */
+                if (!AT_OFF) {
+                        req->rq_timeout = PING_SVC_TIMEOUT +
+                                          at_get(&imp->imp_at.iat_net_latency);
+                        lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+                }
+
                 ptlrpcd_add_req(req);
         } else {
                 CERROR("OOM trying to ping %s->%s\n",
@@ -109,17 +123,33 @@ int ptlrpc_ping(struct obd_import *imp)
 }
 EXPORT_SYMBOL(ptlrpc_ping);
 
-void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
+static void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
 {
 #ifdef ENABLE_PINGER
-        int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
-        if (imp->imp_state == LUSTRE_IMP_DISCON) {
-                int dtime = max_t(int, CONNECTION_SWITCH_MIN,
-                                  AT_OFF ? 0 :
-                                  at_get(&imp->imp_at.iat_net_latency));
-                time = min(time, dtime);
+        cfs_time_t delay, dtime, ctime = cfs_time_current();
+
+        if (imp->imp_state == LUSTRE_IMP_DISCON ||
+            imp->imp_state == LUSTRE_IMP_CONNECTING) {
+                /* In the disconnected case aggressively reconnect, for
+                 * this request the AT service timeout will be set to
+                 * INITIAL_CONNECT_TIMEOUT.  To ensure the request times
+                 * out before we send another we add one extra second. */
+                dtime = cfs_time_seconds(max_t(int, CONNECTION_SWITCH_MIN,
+                                AT_OFF ? 0 : INITIAL_CONNECT_TIMEOUT + 1 +
+                                at_get(&imp->imp_at.iat_net_latency)));
+        } else {
+                /* In the common case we want to cluster the pings
+                 * at regular intervals to minimize system noise. */
+                delay = cfs_time_seconds(soon ? PING_INTERVAL_SHORT :
+                                         PING_INTERVAL);
+                dtime = delay - (ctime % delay);
         }
-        imp->imp_next_ping = cfs_time_shift(time);
+        /* May harmlessly race with ptlrpc_update_next_ping() */
+        imp->imp_next_ping = cfs_time_add(ctime, dtime);
+
+        CDEBUG(D_HA, "Setting %s next ping to "CFS_TIME_T" ("CFS_TIME_T")\n",
+               obd2cli_tgt(imp->imp_obd), imp->imp_next_ping, dtime);
+
 #endif /* ENABLE_PINGER */
 }
 
@@ -171,9 +201,13 @@ static int ptlrpc_pinger_main(void *arg)
                 cfs_time_t this_ping = cfs_time_current();
                 struct l_wait_info lwi;
                 cfs_duration_t time_to_next_wake;
+                cfs_time_t time_of_next_wake;
                 struct timeout_item *item;
                 struct list_head *iter;
 
+                time_to_next_wake = cfs_time_seconds(PING_INTERVAL);
+                time_of_next_wake = cfs_time_shift(PING_INTERVAL);
+
                 mutex_down(&pinger_sem);
                 list_for_each_entry(item, &timeout_list, ti_chain) {
                         item->ti_cb(item, item->ti_cb_data);
@@ -195,16 +229,17 @@ static int ptlrpc_pinger_main(void *arg)
                                ptlrpc_import_state_name(level), level,
                                force, imp->imp_deactive, imp->imp_pingable);
 
+                        /* Include any ping which misses the deadline by up to
+                         * 1/10 of a second.  The pings are designed to clump
+                         * and this helps ensure the entire batch gets sent
+                         * promptly, which minimizes system noise from pings */
+
                         if (force ||
-                            /* if the next ping is within, say, 5 jiffies from
-                               now, go ahead and ping. See note below. */
-                            cfs_time_aftereq(this_ping, 
-                                             imp->imp_next_ping - 5 * CFS_TICK)) {
+                            cfs_time_aftereq(this_ping, imp->imp_next_ping -
+                                             (cfs_time_seconds(1) + 9) / 10)) {
                                 if (level == LUSTRE_IMP_DISCON &&
                                     !imp_is_deactive(imp)) {
-                                        /* wait at least a timeout before
-                                           trying recovery again. */
-                                        imp->imp_next_ping = cfs_time_shift(obd_timeout);
+                                        ptlrpc_update_next_ping(imp, 0);
                                         ptlrpc_initiate_recovery(imp);
                                 } else if (level != LUSTRE_IMP_FULL ||
                                          imp->imp_obd->obd_no_recov ||
@@ -218,6 +253,11 @@ static int ptlrpc_pinger_main(void *arg)
                                                imp->imp_obd->obd_no_recov);
                                 } else if (imp->imp_pingable || force) {
                                                 ptlrpc_ping(imp);
+                                                /* ptlrpc_pinger_sending_on_import()
+                                                 * will asynch update imp_next_ping
+                                                 * so it must not be used below to
+                                                 * calculate minimum wait time. */
+                                                continue;
                                 }
                         } else {
                                 if (!imp->imp_pingable)
@@ -229,30 +269,23 @@ static int ptlrpc_pinger_main(void *arg)
                                        imp->imp_next_ping, this_ping);
                         }
 
-                        /* obd_timeout might have changed */
-                        if (cfs_time_after(imp->imp_next_ping,
-                                           cfs_time_add(this_ping, 
-                                                        cfs_time_seconds(PING_INTERVAL))))
-                                ptlrpc_update_next_ping(imp, 0);
+                        /* Wait time until next ping, or until we're stopped. */
+                        if (cfs_time_before(imp->imp_next_ping,
+                                            time_of_next_wake)) {
+                                time_of_next_wake = imp->imp_next_ping;
+                                time_to_next_wake = max_t(cfs_duration_t,
+                                        cfs_time_seconds(1),
+                                        cfs_time_sub(time_of_next_wake,
+                                                     cfs_time_current()));
+                        }
                 }
                 mutex_up(&pinger_sem);
-                /* update memory usage info */
                 obd_update_maxusage();
+                CDEBUG(D_INFO, "next ping in "CFS_DURATION_T" ("CFS_TIME_T")\n",
+                               time_to_next_wake, time_of_next_wake);
 
-                /* Wait until the next ping time, or until we're stopped. */
-                time_to_next_wake = pinger_check_timeout(this_ping);
-                /* The ping sent by ptlrpc_send_rpc may get sent out
-                   say .01 second after this.
-                   ptlrpc_pinger_eending_on_import will then set the
-                   next ping time to next_ping + .01 sec, which means
-                   we will SKIP the next ping at next_ping, and the
-                   ping will get sent 2 timeouts from now!  Beware. */
-                CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" ("CFS_TIME_T")\n",
-                               time_to_next_wake,
-                               cfs_time_add(this_ping, cfs_time_seconds(PING_INTERVAL)));
                 if (time_to_next_wake > 0) {
-                        lwi = LWI_TIMEOUT(max_t(cfs_duration_t, time_to_next_wake, cfs_time_seconds(1)),
-                                          NULL, NULL);
+                        lwi = LWI_TIMEOUT(time_to_next_wake, NULL, NULL);
                         l_wait_event(thread->t_ctl_waitq,
                                      thread->t_flags & (SVC_STOPPING|SVC_EVENT),
                                      &lwi);
@@ -700,8 +733,9 @@ static int pinger_check_rpcs(void *arg)
                         list_entry(iter, struct obd_import, imp_pinger_chain);
                 int generation, level;
 
-                if (cfs_time_aftereq(pd->pd_this_ping, 
-                                     imp->imp_next_ping - 5 * CFS_TICK)) {
+                /* Include any ping within 1/10 of a second of the deadline */
+                if (cfs_time_aftereq(pd->pd_this_ping, imp->imp_next_ping -
+                                     (cfs_time_seconds(1) + 9) / 10)) {
                         /* Add a ping. */
                         spin_lock(&imp->imp_lock);
                         generation = imp->imp_generation;