From 8da33c6cc5192303fcd18f45892e1f115004e662 Mon Sep 17 00:00:00 2001 From: Jinshan Xiong Date: Wed, 26 Oct 2011 23:52:43 -0600 Subject: [PATCH] ORNL-28 recovery: rework extend_recovery_timer() extend_recovery_timer() is used to adjust the timeout value of a recovering target. The original implementation had a problem: it stopped the target from firing the timer again in the version recovery case. Change-Id: I815a15fb5d3104e52a189eed1529c58d7a8d03b9 Signed-off-by: Jinshan Xiong Reviewed-on: http://review.whamcloud.com/1620 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Mikhail Pershin Reviewed-by: Oleg Drokin --- lustre/include/lustre_lib.h | 1 - lustre/ldlm/ldlm_lib.c | 74 ++++++++++++++++++++++++++++++--------------- lustre/mdt/mdt_handler.c | 3 -- lustre/obdfilter/filter.c | 20 +++++------- 4 files changed, 57 insertions(+), 41 deletions(-) diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h index de7b96c..6283648 100644 --- a/lustre/include/lustre_lib.h +++ b/lustre/include/lustre_lib.h @@ -97,7 +97,6 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req); struct l_wait_info; -void target_start_recovery_timer(struct obd_device *obd); void target_cancel_recovery_timer(struct obd_device *obd); void target_stop_recovery_thread(struct obd_device *obd); void target_cleanup_recovery(struct obd_device *obd); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 996da3e..3b96a4b 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -674,12 +674,12 @@ EXPORT_SYMBOL(target_client_add_cb); #ifdef __KERNEL__ static void -check_and_extend_recovery_timer(struct obd_device *obd, - struct ptlrpc_request *req); +check_and_start_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, int new_client); #else static inline void -check_and_extend_recovery_timer(struct obd_device *obd, - struct ptlrpc_request *req) +check_and_start_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, int new_client) 
{ } #endif @@ -910,8 +910,8 @@ no_export: /* If this is the first time a client connects, reset the recovery * timer */ - if (rc == 0 && target->obd_recovering && export) - check_and_extend_recovery_timer(target, req); + if (rc == 0 && target->obd_recovering) + check_and_start_recovery_timer(target, req, export == NULL); /* We want to handle EALREADY but *not* -EALREADY from * target_handle_reconnect(), return reconnection state in a flag */ @@ -1375,15 +1375,20 @@ void target_cancel_recovery_timer(struct obd_device *obd) cfs_timer_disarm(&obd->obd_recovery_timer); } -void target_start_recovery_timer(struct obd_device *obd) +static void target_start_recovery_timer(struct obd_device *obd) { + if (obd->obd_recovery_start != 0) + return; + cfs_spin_lock(&obd->obd_dev_lock); if (!obd->obd_recovering || obd->obd_abort_recovery) { cfs_spin_unlock(&obd->obd_dev_lock); return; } - if (cfs_timer_is_armed(&obd->obd_recovery_timer)) { + LASSERT(obd->obd_recovery_timeout != 0); + + if (obd->obd_recovery_start != 0) { cfs_spin_unlock(&obd->obd_dev_lock); return; } @@ -1392,31 +1397,49 @@ void target_start_recovery_timer(struct obd_device *obd) cfs_time_shift(obd->obd_recovery_timeout)); obd->obd_recovery_start = cfs_time_current_sec(); cfs_spin_unlock(&obd->obd_dev_lock); - CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name); + + LCONSOLE_WARN("%s: Will be in recovery for at least %d:%.02d, " + "or until %d client%s reconnect%s\n", + obd->obd_name, + obd->obd_recovery_timeout / 60, + obd->obd_recovery_timeout % 60, + obd->obd_max_recoverable_clients, + (obd->obd_max_recoverable_clients == 1) ? "" : "s", + (obd->obd_max_recoverable_clients == 1) ? "s": ""); } -EXPORT_SYMBOL(target_start_recovery_timer); /* extend recovery window to have extra @duration seconds at least. 
*/ static void extend_recovery_timer(struct obd_device *obd, int drt) { - cfs_time_t now = cfs_time_current_sec(); + cfs_time_t now; + cfs_time_t end; cfs_duration_t left; - if (!cfs_timer_is_armed(&obd->obd_recovery_timer)) { - cfs_spin_lock(&obd->obd_dev_lock); - if (obd->obd_recovery_timeout < drt) - obd->obd_recovery_timeout = drt; + cfs_spin_lock(&obd->obd_dev_lock); + if (!obd->obd_recovering || obd->obd_abort_recovery) { cfs_spin_unlock(&obd->obd_dev_lock); return; } - left = obd->obd_recovery_timeout; - left -= cfs_time_sub(now, obd->obd_recovery_start); - if (drt > left) { - cfs_timer_arm(&obd->obd_recovery_timer, cfs_time_shift(drt)); - CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n", - obd->obd_name, (unsigned)drt); + LASSERT(obd->obd_recovery_start != 0); + + now = cfs_time_current_sec(); + end = obd->obd_recovery_start + obd->obd_recovery_timeout; + left = cfs_time_sub(end, now); + if (left < 0) { + obd->obd_recovery_timeout += drt - left; + } else if (left < drt) { + drt -= left; + obd->obd_recovery_timeout += drt; + } else { + drt = left; } + + cfs_timer_arm(&obd->obd_recovery_timer, cfs_time_shift(drt)); + cfs_spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n", + obd->obd_name, (unsigned)drt); } /* Reset the timer with each new client connection */ @@ -1431,19 +1454,22 @@ static void extend_recovery_timer(struct obd_device *obd, int drt) */ static void -check_and_extend_recovery_timer(struct obd_device *obd, - struct ptlrpc_request *req) +check_and_start_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, + int new_client) { int service_time = lustre_msg_get_service_time(req->rq_reqmsg); struct obd_device_target *obt = &obd->u.obt; struct lustre_sb_info *lsi; - if (service_time) + if (!new_client && service_time) /* Teach server about old server's estimates, as first guess * at how long new requests will take. 
*/ at_measured(&req->rq_rqbd->rqbd_service->srv_at_estimate, service_time); + target_start_recovery_timer(obd); + /* convert the service time to rpc timeout, * reuse service_time to limit stack usage */ service_time = at_est2timeout(service_time); diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 8422474..e94ade4 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -5306,9 +5306,6 @@ static int mdt_obd_notify(struct obd_device *obd, switch (ev) { case OBD_NOTIFY_CONFIG: - /* reset recovery timeout in case it has already started */ - target_start_recovery_timer(obd); - mdt_allow_cli(mdt, (unsigned long)data); #ifdef HAVE_QUOTA_SUPPORT diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 0dd323b..6f2b7b9 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -2014,6 +2014,9 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg, /* failover is the default */ obd->obd_replayable = 1; + /* disable connection until configuration finishes */ + obd->obd_no_conn = 1; + if (lcfg->lcfg_bufcount > 3 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { str = lustre_cfg_string(lcfg, 3); if (strchr(str, 'n')) { @@ -2121,17 +2124,6 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg, lmi ? s2lsi(lmi->lmi_sb)->lsi_lmd->lmd_dev : "", obd->obd_replayable ? "enabled" : "disabled"); - if (obd->obd_recovering) - LCONSOLE_WARN("%s: Will be in recovery for at least %d:%.02d, " - "or until %d client%s reconnect%s\n", - obd->obd_name, - obd->obd_recovery_timeout / 60, - obd->obd_recovery_timeout % 60, - obd->obd_max_recoverable_clients, - (obd->obd_max_recoverable_clients == 1) ? "" : "s", - (obd->obd_max_recoverable_clients == 1) ? 
"s": ""); - - RETURN(0); err_post: @@ -4697,8 +4689,10 @@ static int filter_notify(struct obd_device *obd, { switch (ev) { case OBD_NOTIFY_CONFIG: - /* reset recovery timeout in case it has already started */ - target_start_recovery_timer(obd); + LASSERT(obd->obd_no_conn); + cfs_spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 0; + cfs_spin_unlock(&obd->obd_dev_lock); break; default: CDEBUG(D_INFO, "%s: Unhandled notification %#x\n", -- 1.8.3.1