/* NOTE(review): this span carries unified-diff '-' markers; SOURCE appears to
   be a patch fragment.  The '-' line below removes the AT_FLG_MIN flag as
   part of dropping the adaptive_timeout_min tunable. */
#define D_ADAPTTO D_OTHER
#define AT_BINS 4 /* "bin" means "N seconds of history" */
#define AT_FLG_NOHIST 0x1 /* use last reported value only */
-#define AT_FLG_MIN 0x2 /* use a minimum limit */
struct adaptive_timeout {
time_t at_binstart; /* bin start time */
/* NOTE(review): the fields from here down cannot belong to
   struct adaptive_timeout (it is used as their element type below);
   they look like members of a different struct -- presumably imp_at --
   whose opening lines were dropped by the extraction.  Confirm against
   the full file before relying on this layout. */
int iat_portal[IMP_AT_MAX_PORTALS];
struct adaptive_timeout iat_net_latency;
struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS];
- time_t iat_drain; /* hack to slow reconnect reqs */
};
struct obd_import {
networking / disk / timings affected by load (use Adaptive Timeouts) */
/* Global timeout tunables.  The patch removes the adaptive_timeout_min
   tunable ('-' line below); adaptive_timeout_max/history remain. */
extern unsigned int obd_timeout; /* seconds */
extern unsigned int ldlm_timeout; /* seconds */
-extern unsigned int adaptive_timeout_min; /* seconds */
extern unsigned int adaptive_timeout_max; /* seconds */
extern unsigned int adaptive_timeout_history; /* seconds */
extern unsigned int obd_sync_filter;
#define PING_EVICT_TIMEOUT (PING_INTERVAL * 5 / 2)
#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */
#define CONNECTION_SWITCH_MIN 5 /* Connection switching rate limiter */
/* New in this patch: cap and backoff step used by import_select_connection
   to grow the connect timeout when every connection has been tried. */
+#define CONNECTION_SWITCH_MAX 50 /* Max connect interval for nonresponsive
+ servers; keep this within the recovery
+ period */
+#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */
#ifndef CRAY_XT3
/* In general this should be low to have quick detection of a system
- running on a backup server. */
+ running on a backup server. (If it's too low, import_select_connection
+ will increase the timeout anyhow.) */
#define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/20)
#else
/* ...but for very large systems (e.g. CRAY) we need to keep the initial
chance to generate adaptive timeout data. */
#define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/2)
#endif
/* LND_TIMEOUT is removed by this patch; its former users now rely on the
   adaptive net-latency estimate instead of a fixed LND drain period. */
-#define LND_TIMEOUT 50 /* LNET LND-level RPC timeout */
#define LONG_UNLINK 300 /* Unlink should happen before now */
LASSERT(dev != NULL);
desc = &dev->u.lov.desc;
*eof = 1;
- return snprintf(page, count, "%u\n", desc->ld_default_stripe_count);
+ return snprintf(page, count, "%d\n", desc->ld_default_stripe_count);
}
static int lov_wr_stripecount(struct file *file, const char *buffer,
/* Definitions (and kernel exports) of the tunables declared above.
   The '-' lines drop adaptive_timeout_min and its EXPORT_SYMBOL, matching
   the header change in this patch. */
unsigned int obd_dump_on_eviction;
unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */
unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
-/* Covers the maximum expected network latency */
-unsigned int adaptive_timeout_min = 10; /* seconds */
unsigned int adaptive_timeout_max = 600; /* seconds */
/* We remember the slowest event that took place within history */
unsigned int adaptive_timeout_history = 600; /* seconds */
EXPORT_SYMBOL(obd_dump_on_eviction);
EXPORT_SYMBOL(obd_timeout);
EXPORT_SYMBOL(ldlm_timeout);
-EXPORT_SYMBOL(adaptive_timeout_min);
EXPORT_SYMBOL(adaptive_timeout_max);
EXPORT_SYMBOL(adaptive_timeout_history);
EXPORT_SYMBOL(obd_max_dirty_pages);
/* Initialize the per-import adaptive-timeout state.
 * After this patch the net-latency estimator is seeded with the small
 * CONNECTION_SWITCH_INC constant and no flags, replacing the old
 * user-tunable adaptive_timeout_min / AT_FLG_MIN floor; the iat_drain
 * field is gone entirely. */
static void init_imp_at(struct imp_at *at) {
int i;
- /* We need enough time to get an early response on a slow network.
- Since we can't say for sure how slow a network might be, we use
- a user-defined max expected network latency. We will adapt to slow
- increases, but a sudden jump can still kill us. */
- at_init(&at->iat_net_latency, adaptive_timeout_min, AT_FLG_MIN);
+ at_init(&at->iat_net_latency, CONNECTION_SWITCH_INC, 0);
for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
/* max service estimates are tracked on the server side, so
don't use the AT history here, just use the last reported
value (NOTE(review): the original closing of this comment was
dropped by the extraction -- restored here so the at_init call
below is not swallowed). */
at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT,
AT_FLG_NOHIST);
}
- at->iat_drain = 0;
}
struct obd_import *class_new_import(struct obd_device *obd)
OBD_DUMP_ON_EVICTION, /* dump kernel debug log upon eviction */
OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */
- ADAPTIVE_MIN, /* Adaptive timeout lower limit */
ADAPTIVE_MAX, /* Adaptive timeout upper limit */
ADAPTIVE_HISTORY, /* Adaptive timeout timebase */
};
},
#endif
{
- .ctl_name = ADAPTIVE_MIN,
- .procname = "adaptive_min",
- .data = &adaptive_timeout_min,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
.ctl_name = ADAPTIVE_MAX,
.procname = "adaptive_max",
.data = &adaptive_timeout_max,
/* We could get even fancier here, using history to predict increased
loading... */
- if (at->iat_drain > req->rq_timeout) {
- /* If we're trying to drain the network queues, give this
- req a long timeout */
- req->rq_timeout = at->iat_drain;
- CDEBUG(D_ADAPTTO, "waiting %ds to let queues drain\n",
- req->rq_timeout);
- }
-
/* Let the server know what this RPC timeout is by putting it in the
reqmsg*/
lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
/* Network latency is total time less server processing time */
nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/;
- if (st > now - req->rq_sent + 1 /* rounding */)
+ if (st > now - req->rq_sent + 2 /* rounding */)
CERROR("Reported service time %u > total measured time %ld\n",
st, now - req->rq_sent);
struct list_head *tmp, *n;
struct ptlrpc_request *req;
struct l_wait_info lwi;
- time_t last = 0;
- int timeout, rc = 0;
+ int rc;
atomic_inc(&imp->imp_inval_count);
LASSERT(imp->imp_invalid);
- /* wait for all requests to error out and call completion callbacks */
- spin_lock(&imp->imp_lock);
- list_for_each_safe(tmp, n, &imp->imp_sending_list) {
- req = list_entry(tmp, struct ptlrpc_request, rq_list);
- last = max(last, req->rq_deadline);
- }
- list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
- req = list_entry(tmp, struct ptlrpc_request, rq_list);
- last = max(last, req->rq_deadline);
- }
- spin_unlock(&imp->imp_lock);
+ /* wait for all requests to error out and call completion callbacks.
+ Cap it at obd_timeout -- these should all have been locally
+ cancelled by ptlrpc_abort_inflight. */
+ lwi = LWI_TIMEOUT_INTERVAL(
+ cfs_timeout_cap(cfs_time_seconds(obd_timeout)),
+ cfs_time_seconds(1), NULL, NULL);
+ rc = l_wait_event(imp->imp_recovery_waitq,
+ (atomic_read(&imp->imp_inflight) == 0), &lwi);
- timeout = (int)(last - cfs_time_current_sec());
- if (timeout > 0) {
- lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(timeout),
- cfs_time_seconds(1), NULL, NULL);
- rc = l_wait_event(imp->imp_recovery_waitq,
- (atomic_read(&imp->imp_inflight) == 0),
- &lwi);
- }
-
- if (atomic_read(&imp->imp_inflight)) {
+ if (rc) {
CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
obd2cli_tgt(imp->imp_obd), rc,
atomic_read(&imp->imp_inflight));
LASSERT(imp_conn->oic_conn);
/* If we've tried everything, and we're back to the beginning of the
- list, wait for LND_TIMEOUT to give the queues a chance to drain. */
+ list, increase our timeout and try again. It will be reset when
+ we do finally connect. (FIXME: really we should wait for all network
+ state associated with the last connection attempt to drain before
+ trying to reconnect on it.) */
if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
- int must_wait;
+ if (at_get(&imp->imp_at.iat_net_latency) <
+ CONNECTION_SWITCH_MAX) {
+ at_add(&imp->imp_at.iat_net_latency,
+ at_get(&imp->imp_at.iat_net_latency) +
+ CONNECTION_SWITCH_INC);
+ }
LASSERT(imp_conn->oic_last_attempt);
- must_wait = LND_TIMEOUT -
- (int)cfs_duration_sec(cfs_time_current_64() -
- imp_conn->oic_last_attempt);
- imp->imp_at.iat_drain = max(0, must_wait);
- CWARN("Tried all connections, %lus drain time\n",
- imp->imp_at.iat_drain);
- } else {
- imp->imp_at.iat_drain = 0;
+ CWARN("Tried all connections, increasing latency to %ds\n",
+ at_get(&imp->imp_at.iat_net_latency));
}
imp_conn->oic_last_attempt = cfs_time_current_64();
ENTRY;
spin_lock(&imp->imp_lock);
- imp->imp_at.iat_drain = 0;
if (imp->imp_state == LUSTRE_IMP_CLOSED) {
spin_unlock(&imp->imp_lock);
RETURN(0);
at->at_binstart += shift * binlimit;
}
- if ((at->at_flags & AT_FLG_MIN) &&
- (at->at_current < adaptive_timeout_min))
- at->at_current = adaptive_timeout_min;
-
if (at->at_current > at->at_worst_ever) {
at->at_worst_ever = at->at_current;
at->at_worst_time = now;
/* Schedule this import's next ping.
 * Normally the next ping is PING_INTERVAL seconds out.  After this patch,
 * while the import is disconnected the interval is shortened to the
 * adaptive net-latency estimate (floored at CONNECTION_SWITCH_MIN) but
 * never lengthened beyond PING_INTERVAL -- replacing the old ternary that
 * used the estimate unconditionally when disconnected (and its FIXME about
 * LND queue buildup). */
void ptlrpc_update_next_ping(struct obd_import *imp)
{
#ifdef ENABLE_PINGER
- int time = (imp->imp_state != LUSTRE_IMP_DISCON) ? PING_INTERVAL :
- /* FIXME should this be limited to LND_TIMEOUT so we don't
- build up pings in LND output queues? */
- max_t(int, CONNECTION_SWITCH_MIN,
- at_get(&imp->imp_at.iat_net_latency));
+ int time = PING_INTERVAL;
+ if (imp->imp_state == LUSTRE_IMP_DISCON) {
+ int dtime = max_t(int, CONNECTION_SWITCH_MIN,
+ at_get(&imp->imp_at.iat_net_latency));
+ time = min(time, dtime);
+ }
imp->imp_next_ping = cfs_time_shift(time);
#endif /* ENABLE_PINGER */
}