configuration management server (MGS). The functionality also
allows to specify sets of clients for which the remapping does
not apply.
+
+Severity : normal
+Bugzilla : 16860
+Description: Excessive recovery window
+Details : With AT enabled, the recovery window can be excessively long (6000+
+ seconds). To address this problem, we no longer use
+ OBD_RECOVERY_FACTOR when extending the recovery window (the connect
+ timeout no longer depends on the service time; it is now set to
+ INITIAL_CONNECT_TIMEOUT) and clients report the old service
+ time via pb_service_time.
+
--------------------------------------------------------------------------------
2007-08-10 Cluster File Systems, Inc. <info@clusterfs.com>
static inline unsigned int at_timeout2est(unsigned int val)
{
- /* restore estimate value from timeout */
+ /* restore estimate value from timeout: e=4/5(t-5) */
LASSERT(val);
- return ((val - 1) / 5 * 4);
+ return (max((val << 2) / 5, 5U) - 4);
}
static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
struct ptlrpc_request *req,
int new_client)
{
- int req_timeout = lustre_msg_get_timeout(req->rq_reqmsg);
+ int service_time = lustre_msg_get_service_time(req->rq_reqmsg);
- /* teach server about old server's estimates */
- if (!new_client)
+ if (!new_client && service_time)
+ /* Teach server about old server's estimates, as first guess
+ * at how long new requests will take. */
at_add(&req->rq_rqbd->rqbd_service->srv_at_estimate,
- at_timeout2est(req_timeout));
+ service_time);
check_and_start_recovery_timer(obd);
- req_timeout *= OBD_RECOVERY_FACTOR;
- if (req_timeout > obd->obd_recovery_timeout && !new_client)
- reset_recovery_timer(obd, req_timeout, 0);
+ /* convert the service time to rpc timeout,
+ * reuse service_time to limit stack usage */
+ service_time = at_est2timeout(service_time);
+
+ /* We expect other clients to time out within service_time, then try
+ * to reconnect, then try the failover server. The max delay between
+ * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL */
+ service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC +
+ INITIAL_CONNECT_TIMEOUT);
+ if (service_time > obd->obd_recovery_timeout && !new_client)
+ reset_recovery_timer(obd, service_time, 0);
}
#ifdef __KERNEL__
if (!req_replay_done(req->rq_export) ||
!lock_replay_done(req->rq_export))
reset_recovery_timer(class_exp2obd(req->rq_export),
- OBD_RECOVERY_FACTOR * AT_OFF ? obd_timeout :
+ AT_OFF ? obd_timeout :
at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate), 1);
ptlrpc_free_clone(req);
RETURN(0);
GOTO(out, rc);
}
+ /* Report the rpc service time to the server so that it knows how long
+ * to wait for clients to join recovery */
+ lustre_msg_set_service_time(request->rq_reqmsg,
+ at_timeout2est(request->rq_timeout));
+
+ /* The amount of time we give the server to process the connect req.
+ * import_select_connection will increase the net latency on
+ * repeated reconnect attempts to cover slow networks.
+ * We override/ignore the server rpc completion estimate here,
+ * which may be large if this is a reconnect attempt */
+ request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
+ lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
+
#ifndef __KERNEL__
lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_LIBCLIENT);
#endif
spin_unlock(&imp->imp_lock);
lustre_msg_add_op_flags(request->rq_reqmsg,
MSG_CONNECT_INITIAL);
- if (AT_OFF)
- /* AT will use INITIAL_CONNECT_TIMEOUT the first
- time, adaptive after that. */
- request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
}
if (set_transno)