From: Mikhail Pershin Date: Tue, 20 Nov 2012 18:01:23 +0000 (+0400) Subject: LU-2104 recovery: keep valid obd_stale_clients during recovery X-Git-Tag: 2.3.59~71 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=35dc2e39288f15abab2d6bf7bb3f572d9670521c;ds=sidebyside LU-2104 recovery: keep valid obd_stale_clients during recovery - in class_fail_export() update obd_stale_clients counter if recovery is in progress. That fixes endless recovery issues. - class_disconnect_stale_exports() should check exp_failed and exclude such exports from being evicted once more. - clear obd_no_conn flag in ofd_prepare() like mdt does. - remove check for obd_no_conn from check_for_clients(). That is an extra check as we don't allow any client to connect while obd_no_conn is set. Signed-off-by: Mikhail Pershin Change-Id: I439a1edd49a9c51f42df4c423d7c165468a3d38d Reviewed-on: http://review.whamcloud.com/4636 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Andreas Dilger --- diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 9428064..7be8ebc 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1039,23 +1039,25 @@ no_export: /* allow lightweight connections during recovery */ if (target->obd_recovering && !lw_client) { cfs_time_t t; - int c; /* connected */ - int i; /* in progress */ - int k; /* known */ + int c; /* connected */ + int i; /* in progress */ + int k; /* known */ + int s; /* stale/evicted */ c = cfs_atomic_read(&target->obd_connected_clients); i = cfs_atomic_read(&target->obd_lock_replay_clients); k = target->obd_max_recoverable_clients; + s = target->obd_stale_clients; t = cfs_timer_deadline(&target->obd_recovery_timer); t = cfs_time_sub(t, cfs_time_current()); t = cfs_duration_sec(t); LCONSOLE_WARN("%s: Denying connection for new client " "%s (at %s), waiting for all %d known " "clients (%d recovered, %d in progress, " - "and %d unseen) to recover in %d:%.02d\n", + "and %d evicted) to 
recover in %d:%.02d\n", target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), k, - c - i, i, k - c, (int)t / 60, + c - i, i, s, (int)t / 60, (int)t % 60); rc = -EBUSY; } else { @@ -1602,7 +1604,8 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend) if (to > obd->obd_recovery_time_hard) to = obd->obd_recovery_time_hard; - if (obd->obd_recovery_timeout < to) { + if (obd->obd_recovery_timeout < to || + obd->obd_recovery_timeout == obd->obd_recovery_time_hard) { obd->obd_recovery_timeout = to; cfs_timer_arm(&obd->obd_recovery_timer, cfs_time_shift(drt)); @@ -1696,10 +1699,8 @@ static int check_for_clients(struct obd_device *obd) if (obd->obd_abort_recovery || obd->obd_recovery_expired) return 1; LASSERT(clnts <= obd->obd_max_recoverable_clients); - if (obd->obd_no_conn == 0 && - clnts + obd->obd_stale_clients == obd->obd_max_recoverable_clients) - return 1; - return 0; + return (clnts + obd->obd_stale_clients == + obd->obd_max_recoverable_clients); } static int check_for_next_transno(struct obd_device *obd) diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 75f19fd..f495084 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1133,12 +1133,21 @@ void class_export_recovery_cleanup(struct obd_export *exp) spin_lock(&obd->obd_recovery_task_lock); if (exp->exp_delayed) obd->obd_delayed_clients--; - if (obd->obd_recovering && exp->exp_in_recovery) { - spin_lock(&exp->exp_lock); - exp->exp_in_recovery = 0; - spin_unlock(&exp->exp_lock); - LASSERT_ATOMIC_POS(&obd->obd_connected_clients); - cfs_atomic_dec(&obd->obd_connected_clients); + if (obd->obd_recovering) { + if (exp->exp_in_recovery) { + spin_lock(&exp->exp_lock); + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + LASSERT_ATOMIC_POS(&obd->obd_connected_clients); + cfs_atomic_dec(&obd->obd_connected_clients); + } + + /* if called during recovery then should update + * obd_stale_clients counter, + * lightweight exports are not 
counted */ + if (exp->exp_failed && + (exp->exp_connect_flags & OBD_CONNECT_LIGHTWEIGHT) == 0) + exp->exp_obd->obd_stale_clients++; } spin_unlock(&obd->obd_recovery_task_lock); /** Cleanup req replay fields */ @@ -1148,7 +1157,7 @@ void class_export_recovery_cleanup(struct obd_export *exp) spin_unlock(&exp->exp_lock); LASSERT(cfs_atomic_read(&obd->obd_req_replay_clients)); cfs_atomic_dec(&obd->obd_req_replay_clients); - } + } /** Cleanup lock replay data */ if (exp->exp_lock_replay_needed) { spin_lock(&exp->exp_lock); @@ -1313,7 +1322,7 @@ void class_disconnect_stale_exports(struct obd_device *obd, continue; spin_lock(&exp->exp_lock); - if (test_export(exp)) { + if (exp->exp_failed || test_export(exp)) { spin_unlock(&exp->exp_lock); continue; } @@ -1330,14 +1339,13 @@ void class_disconnect_stale_exports(struct obd_device *obd, } spin_unlock(&obd->obd_dev_lock); - if (evicted) { - LCONSOLE_WARN("%s: disconnecting %d stale clients\n", - obd->obd_name, evicted); - obd->obd_stale_clients += evicted; - } - class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) | - OBD_OPT_ABORT_RECOV); - EXIT; + if (evicted) + LCONSOLE_WARN("%s: disconnecting %d stale clients\n", + obd->obd_name, evicted); + + class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); + EXIT; } EXPORT_SYMBOL(class_disconnect_stale_exports); diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index f4b0bec..1f7bf8e 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -356,6 +356,10 @@ static int ofd_prepare(const struct lu_env *env, struct lu_device *pdev, rc = next->ld_ops->ldo_prepare(env, dev, next); target_recovery_init(&ofd->ofd_lut, ost_handle); + LASSERT(obd->obd_no_conn); + spin_lock(&obd->obd_dev_lock); + obd->obd_no_conn = 0; + spin_unlock(&obd->obd_dev_lock); if (obd->obd_recovering == 0) ofd_postrecov(env, ofd); diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c index 48e28a4..d218018 100644 --- a/lustre/ofd/ofd_obd.c +++ 
b/lustre/ofd/ofd_obd.c @@ -1457,23 +1457,6 @@ out: return !!rc; } -static int ofd_obd_notify(struct obd_device *obd, struct obd_device *unused, - enum obd_notify_event ev, void *data) -{ - switch (ev) { - case OBD_NOTIFY_CONFIG: - LASSERT(obd->obd_no_conn); - spin_lock(&obd->obd_dev_lock); - obd->obd_no_conn = 0; - spin_unlock(&obd->obd_dev_lock); - break; - default: - CDEBUG(D_INFO, "%s: Unhandled notification %#x\n", - obd->obd_name, ev); - } - return 0; -} - /* * Handle quota control requests to consult current usage/limit. * @@ -1527,6 +1510,5 @@ struct obd_ops ofd_obd_ops = { .o_precleanup = ofd_precleanup, .o_ping = ofd_ping, .o_health_check = ofd_health_check, - .o_notify = ofd_obd_notify, .o_quotactl = ofd_quotactl, };