Whamcloud - gitweb
b=13537
author nathan <nathan>
Mon, 24 Sep 2007 16:53:05 +0000 (16:53 +0000)
committer nathan <nathan>
Mon, 24 Sep 2007 16:53:05 +0000 (16:53 +0000)
b=3055
in some cases we wouldn't schedule at ptlrpc_invalidate_import
cap reconnect ping interval
use simple reconnect backoff instead of "drain"

lustre/include/lustre_import.h
lustre/include/obd_support.h
lustre/lov/lproc_lov.c
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdclass/linux/linux-sysctl.c
lustre/ptlrpc/client.c
lustre/ptlrpc/import.c
lustre/ptlrpc/pinger.c

index cef9318..6dc3840 100644 (file)
@@ -13,7 +13,6 @@
 #define D_ADAPTTO D_OTHER
 #define AT_BINS 4                  /* "bin" means "N seconds of history" */
 #define AT_FLG_NOHIST 0x1          /* use last reported value only */
-#define AT_FLG_MIN    0x2          /* use a minimum limit */
 
 struct adaptive_timeout {
         time_t       at_binstart;         /* bin start time */
@@ -70,7 +69,6 @@ struct imp_at {
         int                     iat_portal[IMP_AT_MAX_PORTALS];
         struct adaptive_timeout iat_net_latency;
         struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS];
-        time_t                  iat_drain; /* hack to slow reconnect reqs */
 };
 
 struct obd_import {
index 8bd5753..9b742d3 100644 (file)
@@ -38,7 +38,6 @@ extern unsigned int obd_dump_on_eviction;
    networking / disk / timings affected by load (use Adaptive Timeouts) */
 extern unsigned int obd_timeout;          /* seconds */
 extern unsigned int ldlm_timeout;         /* seconds */
-extern unsigned int adaptive_timeout_min; /* seconds */
 extern unsigned int adaptive_timeout_max; /* seconds */
 extern unsigned int adaptive_timeout_history; /* seconds */
 extern unsigned int obd_sync_filter;
@@ -60,9 +59,14 @@ extern unsigned int obd_alloc_fail_rate;
 #define PING_EVICT_TIMEOUT (PING_INTERVAL * 5 / 2)
 #define DISK_TIMEOUT 50          /* Beyond this we warn about disk speed */
 #define CONNECTION_SWITCH_MIN 5  /* Connection switching rate limiter */
+#define CONNECTION_SWITCH_MAX 50 /* Max connect interval for nonresponsive
+                                    servers; keep this within the recovery
+                                    period */
+#define CONNECTION_SWITCH_INC 5  /* Connection timeout backoff */
 #ifndef CRAY_XT3
 /* In general this should be low to have quick detection of a system 
-   running on a backup server. */
+   running on a backup server. (If it's too low, import_select_connection
+   will increase the timeout anyhow.)  */
 #define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/20)
 #else
 /* ...but for very large systems (e.g. CRAY) we need to keep the initial 
@@ -71,7 +75,6 @@ extern unsigned int obd_alloc_fail_rate;
    chance to generate adaptive timeout data. */
 #define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/2)
 #endif
-#define LND_TIMEOUT 50           /* LNET LND-level RPC timeout */
 #define LONG_UNLINK 300          /* Unlink should happen before now */
 
 
index 874798c..bad9801 100644 (file)
@@ -135,7 +135,7 @@ static int lov_rd_stripecount(char *page, char **start, off_t off, int count,
         LASSERT(dev != NULL);
         desc = &dev->u.lov.desc;
         *eof = 1;
-        return snprintf(page, count, "%u\n", desc->ld_default_stripe_count);
+        return snprintf(page, count, "%d\n", desc->ld_default_stripe_count);
 }
 
 static int lov_wr_stripecount(struct file *file, const char *buffer,
index 632d7e3..eb82860 100644 (file)
@@ -65,8 +65,6 @@ unsigned int obd_dump_on_timeout;
 unsigned int obd_dump_on_eviction;
 unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT;   /* seconds */
 unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
-/* Covers the maximum expected network latency */
-unsigned int adaptive_timeout_min = 10;           /* seconds */
 unsigned int adaptive_timeout_max = 600;          /* seconds */
 /* We remember the slowest event that took place within history */
 unsigned int adaptive_timeout_history = 600;      /* seconds */
@@ -388,7 +386,6 @@ EXPORT_SYMBOL(obd_dump_on_timeout);
 EXPORT_SYMBOL(obd_dump_on_eviction);
 EXPORT_SYMBOL(obd_timeout);
 EXPORT_SYMBOL(ldlm_timeout);
-EXPORT_SYMBOL(adaptive_timeout_min);
 EXPORT_SYMBOL(adaptive_timeout_max);
 EXPORT_SYMBOL(adaptive_timeout_history);
 EXPORT_SYMBOL(obd_max_dirty_pages);
index 470cae7..81765de 100644 (file)
@@ -793,11 +793,7 @@ EXPORT_SYMBOL(class_import_put);
 
 static void init_imp_at(struct imp_at *at) {
         int i;
-        /* We need enough time to get an early response on a slow network.
-           Since we can't say for sure how slow a network might be, we use
-           a user-defined max expected network latency. We will adapt to slow
-           increases, but a sudden jump can still kill us. */
-        at_init(&at->iat_net_latency, adaptive_timeout_min, AT_FLG_MIN);
+        at_init(&at->iat_net_latency, CONNECTION_SWITCH_INC, 0);
         for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
                 /* max service estimates are tracked on the server side, so
                    don't use the AT history here, just use the last reported
@@ -805,7 +801,6 @@ static void init_imp_at(struct imp_at *at) {
                 at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT,
                         AT_FLG_NOHIST);
         }
-        at->iat_drain = 0;
 }
 
 struct obd_import *class_new_import(struct obd_device *obd)
index 85cba69..0d36662 100644 (file)
@@ -62,7 +62,6 @@ enum {
         OBD_DUMP_ON_EVICTION,   /* dump kernel debug log upon eviction */
         OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
         OBD_ALLOC_FAIL_RATE,    /* memory allocation random failure rate */
-        ADAPTIVE_MIN,           /* Adaptive timeout lower limit */
         ADAPTIVE_MAX,           /* Adaptive timeout upper limit */
         ADAPTIVE_HISTORY,       /* Adaptive timeout timebase */
 };
@@ -199,14 +198,6 @@ static cfs_sysctl_table_t obd_table[] = {
         },
 #endif
         {
-                .ctl_name = ADAPTIVE_MIN,
-                .procname = "adaptive_min",
-                .data     = &adaptive_timeout_min,
-                .maxlen   = sizeof(int),
-                .mode     = 0644,
-                .proc_handler = &proc_dointvec
-        },
-        {
                 .ctl_name = ADAPTIVE_MAX,
                 .procname = "adaptive_max",
                 .data     = &adaptive_timeout_max,
index 85e23ed..0eaddcd 100644 (file)
@@ -215,14 +215,6 @@ static void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
         /* We could get even fancier here, using history to predict increased
            loading... */
              
-        if (at->iat_drain > req->rq_timeout) {
-                /* If we're trying to drain the network queues, give this 
-                   req a long timeout */
-                req->rq_timeout = at->iat_drain;
-                CDEBUG(D_ADAPTTO, "waiting %ds to let queues drain\n",
-                       req->rq_timeout);
-        }
-
         /* Let the server know what this RPC timeout is by putting it in the 
            reqmsg*/
         lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
@@ -271,7 +263,7 @@ static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req)
         
         /* Network latency is total time less server processing time */
         nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/;
-        if (st > now - req->rq_sent + 1 /* rounding */) 
+        if (st > now - req->rq_sent + 2 /* rounding */)
                 CERROR("Reported service time %u > total measured time %ld\n",
                        st, now - req->rq_sent);
 
index a058f6b..327c2b7 100644 (file)
@@ -188,8 +188,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp)
         struct list_head *tmp, *n;
         struct ptlrpc_request *req;
         struct l_wait_info lwi;
-        time_t last = 0;
-        int timeout, rc = 0;
+        int rc;
 
         atomic_inc(&imp->imp_inval_count);
 
@@ -198,28 +197,16 @@ void ptlrpc_invalidate_import(struct obd_import *imp)
 
         LASSERT(imp->imp_invalid);
 
-        /* wait for all requests to error out and call completion callbacks */
-        spin_lock(&imp->imp_lock);
-        list_for_each_safe(tmp, n, &imp->imp_sending_list) {
-                req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                last = max(last, req->rq_deadline);
-        }
-        list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
-                req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                last = max(last, req->rq_deadline);
-        }
-        spin_unlock(&imp->imp_lock);
+        /* wait for all requests to error out and call completion callbacks.
+           Cap it at obd_timeout -- these should all have been locally
+           cancelled by ptlrpc_abort_inflight. */
+        lwi = LWI_TIMEOUT_INTERVAL(
+                cfs_timeout_cap(cfs_time_seconds(obd_timeout)),
+                cfs_time_seconds(1), NULL, NULL);
+        rc = l_wait_event(imp->imp_recovery_waitq,
+                          (atomic_read(&imp->imp_inflight) == 0), &lwi);
 
-        timeout = (int)(last - cfs_time_current_sec());
-        if (timeout > 0) {
-                lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(timeout),
-                                           cfs_time_seconds(1), NULL, NULL);
-                rc = l_wait_event(imp->imp_recovery_waitq,
-                                  (atomic_read(&imp->imp_inflight) == 0),
-                                  &lwi);
-        }
-
-        if (atomic_read(&imp->imp_inflight)) {
+        if (rc) {
                 CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
                        obd2cli_tgt(imp->imp_obd), rc,
                        atomic_read(&imp->imp_inflight));
@@ -339,18 +326,20 @@ static int import_select_connection(struct obd_import *imp)
         LASSERT(imp_conn->oic_conn);
 
         /* If we've tried everything, and we're back to the beginning of the
-           list, wait for LND_TIMEOUT to give the queues a chance to drain. */
+           list, increase our timeout and try again. It will be reset when
+           we do finally connect. (FIXME: really we should wait for all network
+           state associated with the last connection attempt to drain before
+           trying to reconnect on it.) */
         if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
-                int must_wait;
+                if (at_get(&imp->imp_at.iat_net_latency) <
+                    CONNECTION_SWITCH_MAX) {
+                        at_add(&imp->imp_at.iat_net_latency,
+                               at_get(&imp->imp_at.iat_net_latency) +
+                               CONNECTION_SWITCH_INC);
+                }
                 LASSERT(imp_conn->oic_last_attempt);
-                must_wait = LND_TIMEOUT -
-                        (int)cfs_duration_sec(cfs_time_current_64() - 
-                                              imp_conn->oic_last_attempt);
-                imp->imp_at.iat_drain = max(0, must_wait);
-                CWARN("Tried all connections, %lus drain time\n",
-                      imp->imp_at.iat_drain);
-        } else {
-                imp->imp_at.iat_drain = 0;
+                CWARN("Tried all connections, increasing latency to %ds\n",
+                      at_get(&imp->imp_at.iat_net_latency));
         }
 
         imp_conn->oic_last_attempt = cfs_time_current_64();
@@ -568,7 +557,6 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
         ENTRY;
 
         spin_lock(&imp->imp_lock);
-        imp->imp_at.iat_drain = 0;
         if (imp->imp_state == LUSTRE_IMP_CLOSED) {
                 spin_unlock(&imp->imp_lock);
                 RETURN(0);
@@ -1174,10 +1162,6 @@ int at_add(struct adaptive_timeout *at, unsigned int val)
                 at->at_binstart += shift * binlimit;
         }
 
-        if ((at->at_flags & AT_FLG_MIN) && 
-            (at->at_current < adaptive_timeout_min))
-                at->at_current = adaptive_timeout_min;
-
         if (at->at_current > at->at_worst_ever) {
                 at->at_worst_ever = at->at_current;
                 at->at_worst_time = now;
index 6f49146..af74dc7 100644 (file)
@@ -67,11 +67,12 @@ int ptlrpc_ping(struct obd_import *imp)
 void ptlrpc_update_next_ping(struct obd_import *imp)
 {
 #ifdef ENABLE_PINGER
-        int time = (imp->imp_state != LUSTRE_IMP_DISCON) ? PING_INTERVAL :
-                /* FIXME should this be limited to LND_TIMEOUT so we don't
-                   build up pings in LND output queues? */
-                max_t(int, CONNECTION_SWITCH_MIN, 
-                      at_get(&imp->imp_at.iat_net_latency));
+        int time = PING_INTERVAL;
+        if (imp->imp_state == LUSTRE_IMP_DISCON) {
+                int dtime = max_t(int, CONNECTION_SWITCH_MIN,
+                                  at_get(&imp->imp_at.iat_net_latency));
+                time = min(time, dtime);
+        }
         imp->imp_next_ping = cfs_time_shift(time);
 #endif /* ENABLE_PINGER */
 }