Whamcloud - gitweb
LU-2104 recovery: keep valid obd_stale_clients during recovery
author Mikhail Pershin <tappro@whamcloud.com>
Tue, 20 Nov 2012 18:01:23 +0000 (22:01 +0400)
committer Oleg Drokin <green@whamcloud.com>
Tue, 8 Jan 2013 04:56:07 +0000 (23:56 -0500)
- in class_fail_export() update obd_stale_clients counter if
  recovery is in progress. That fixes endless recovery issues.
- class_disconnect_stale_exports() should check exp_failed and
  exclude such exports from being evicted once more.
- clear obd_no_conn flag in ofd_prepare() like mdt does.
- remove check for obd_no_conn from check_for_clients(). That check is
  redundant because no client is allowed to connect while
  obd_no_conn is set.

Signed-off-by: Mikhail Pershin <tappro@whamcloud.com>
Change-Id: I439a1edd49a9c51f42df4c423d7c165468a3d38d
Reviewed-on: http://review.whamcloud.com/4636
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
lustre/ldlm/ldlm_lib.c
lustre/obdclass/genops.c
lustre/ofd/ofd_dev.c
lustre/ofd/ofd_obd.c

index 9428064..7be8ebc 100644 (file)
@@ -1039,23 +1039,25 @@ no_export:
                /* allow lightweight connections during recovery */
                if (target->obd_recovering && !lw_client) {
                         cfs_time_t t;
                /* allow lightweight connections during recovery */
                if (target->obd_recovering && !lw_client) {
                         cfs_time_t t;
-                       int        c; /* connected */
-                       int        i; /* in progress */
-                       int        k; /* known */
+                       int     c; /* connected */
+                       int     i; /* in progress */
+                       int     k; /* known */
+                       int     s; /* stale/evicted */
 
                        c = cfs_atomic_read(&target->obd_connected_clients);
                        i = cfs_atomic_read(&target->obd_lock_replay_clients);
                        k = target->obd_max_recoverable_clients;
 
                        c = cfs_atomic_read(&target->obd_connected_clients);
                        i = cfs_atomic_read(&target->obd_lock_replay_clients);
                        k = target->obd_max_recoverable_clients;
+                       s = target->obd_stale_clients;
                        t = cfs_timer_deadline(&target->obd_recovery_timer);
                        t = cfs_time_sub(t, cfs_time_current());
                        t = cfs_duration_sec(t);
                        LCONSOLE_WARN("%s: Denying connection for new client "
                                      "%s (at %s), waiting for all %d known "
                                      "clients (%d recovered, %d in progress, "
                        t = cfs_timer_deadline(&target->obd_recovery_timer);
                        t = cfs_time_sub(t, cfs_time_current());
                        t = cfs_duration_sec(t);
                        LCONSOLE_WARN("%s: Denying connection for new client "
                                      "%s (at %s), waiting for all %d known "
                                      "clients (%d recovered, %d in progress, "
-                                     "and %d unseen) to recover in %d:%.02d\n",
+                                     "and %d evicted) to recover in %d:%.02d\n",
                                      target->obd_name, cluuid.uuid,
                                      libcfs_nid2str(req->rq_peer.nid), k,
                                      target->obd_name, cluuid.uuid,
                                      libcfs_nid2str(req->rq_peer.nid), k,
-                                     c - i, i, k - c, (int)t / 60,
+                                     c - i, i, s, (int)t / 60,
                                      (int)t % 60);
                         rc = -EBUSY;
                 } else {
                                      (int)t % 60);
                         rc = -EBUSY;
                 } else {
@@ -1602,7 +1604,8 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend)
 
         if (to > obd->obd_recovery_time_hard)
                 to = obd->obd_recovery_time_hard;
 
         if (to > obd->obd_recovery_time_hard)
                 to = obd->obd_recovery_time_hard;
-        if (obd->obd_recovery_timeout < to) {
+       if (obd->obd_recovery_timeout < to ||
+           obd->obd_recovery_timeout == obd->obd_recovery_time_hard) {
                 obd->obd_recovery_timeout = to;
                 cfs_timer_arm(&obd->obd_recovery_timer,
                               cfs_time_shift(drt));
                 obd->obd_recovery_timeout = to;
                 cfs_timer_arm(&obd->obd_recovery_timer,
                               cfs_time_shift(drt));
@@ -1696,10 +1699,8 @@ static int check_for_clients(struct obd_device *obd)
         if (obd->obd_abort_recovery || obd->obd_recovery_expired)
                 return 1;
         LASSERT(clnts <= obd->obd_max_recoverable_clients);
         if (obd->obd_abort_recovery || obd->obd_recovery_expired)
                 return 1;
         LASSERT(clnts <= obd->obd_max_recoverable_clients);
-        if (obd->obd_no_conn == 0 &&
-            clnts + obd->obd_stale_clients == obd->obd_max_recoverable_clients)
-                return 1;
-        return 0;
+       return (clnts + obd->obd_stale_clients ==
+               obd->obd_max_recoverable_clients);
 }
 
 static int check_for_next_transno(struct obd_device *obd)
 }
 
 static int check_for_next_transno(struct obd_device *obd)
index 75f19fd..f495084 100644 (file)
@@ -1133,12 +1133,21 @@ void class_export_recovery_cleanup(struct obd_export *exp)
        spin_lock(&obd->obd_recovery_task_lock);
        if (exp->exp_delayed)
                obd->obd_delayed_clients--;
        spin_lock(&obd->obd_recovery_task_lock);
        if (exp->exp_delayed)
                obd->obd_delayed_clients--;
-       if (obd->obd_recovering && exp->exp_in_recovery) {
-               spin_lock(&exp->exp_lock);
-               exp->exp_in_recovery = 0;
-               spin_unlock(&exp->exp_lock);
-               LASSERT_ATOMIC_POS(&obd->obd_connected_clients);
-               cfs_atomic_dec(&obd->obd_connected_clients);
+       if (obd->obd_recovering) {
+               if (exp->exp_in_recovery) {
+                       spin_lock(&exp->exp_lock);
+                       exp->exp_in_recovery = 0;
+                       spin_unlock(&exp->exp_lock);
+                       LASSERT_ATOMIC_POS(&obd->obd_connected_clients);
+                       cfs_atomic_dec(&obd->obd_connected_clients);
+               }
+
+               /* if called during recovery then should update
+                * obd_stale_clients counter,
+                * lightweight exports are not counted */
+               if (exp->exp_failed &&
+                   (exp->exp_connect_flags & OBD_CONNECT_LIGHTWEIGHT) == 0)
+                       exp->exp_obd->obd_stale_clients++;
        }
        spin_unlock(&obd->obd_recovery_task_lock);
        /** Cleanup req replay fields */
        }
        spin_unlock(&obd->obd_recovery_task_lock);
        /** Cleanup req replay fields */
@@ -1148,7 +1157,7 @@ void class_export_recovery_cleanup(struct obd_export *exp)
                spin_unlock(&exp->exp_lock);
                LASSERT(cfs_atomic_read(&obd->obd_req_replay_clients));
                cfs_atomic_dec(&obd->obd_req_replay_clients);
                spin_unlock(&exp->exp_lock);
                LASSERT(cfs_atomic_read(&obd->obd_req_replay_clients));
                cfs_atomic_dec(&obd->obd_req_replay_clients);
-        }
+       }
        /** Cleanup lock replay data */
        if (exp->exp_lock_replay_needed) {
                spin_lock(&exp->exp_lock);
        /** Cleanup lock replay data */
        if (exp->exp_lock_replay_needed) {
                spin_lock(&exp->exp_lock);
@@ -1313,7 +1322,7 @@ void class_disconnect_stale_exports(struct obd_device *obd,
                        continue;
 
                spin_lock(&exp->exp_lock);
                        continue;
 
                spin_lock(&exp->exp_lock);
-               if (test_export(exp)) {
+               if (exp->exp_failed || test_export(exp)) {
                        spin_unlock(&exp->exp_lock);
                        continue;
                }
                        spin_unlock(&exp->exp_lock);
                        continue;
                }
@@ -1330,14 +1339,13 @@ void class_disconnect_stale_exports(struct obd_device *obd,
         }
        spin_unlock(&obd->obd_dev_lock);
 
         }
        spin_unlock(&obd->obd_dev_lock);
 
-        if (evicted) {
-                LCONSOLE_WARN("%s: disconnecting %d stale clients\n",
-                              obd->obd_name, evicted);
-                obd->obd_stale_clients += evicted;
-        }
-        class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) |
-                                                 OBD_OPT_ABORT_RECOV);
-        EXIT;
+       if (evicted)
+               LCONSOLE_WARN("%s: disconnecting %d stale clients\n",
+                             obd->obd_name, evicted);
+
+       class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) |
+                                                OBD_OPT_ABORT_RECOV);
+       EXIT;
 }
 EXPORT_SYMBOL(class_disconnect_stale_exports);
 
 }
 EXPORT_SYMBOL(class_disconnect_stale_exports);
 
index f4b0bec..1f7bf8e 100644 (file)
@@ -356,6 +356,10 @@ static int ofd_prepare(const struct lu_env *env, struct lu_device *pdev,
        rc = next->ld_ops->ldo_prepare(env, dev, next);
 
        target_recovery_init(&ofd->ofd_lut, ost_handle);
        rc = next->ld_ops->ldo_prepare(env, dev, next);
 
        target_recovery_init(&ofd->ofd_lut, ost_handle);
+       LASSERT(obd->obd_no_conn);
+       spin_lock(&obd->obd_dev_lock);
+       obd->obd_no_conn = 0;
+       spin_unlock(&obd->obd_dev_lock);
 
        if (obd->obd_recovering == 0)
                ofd_postrecov(env, ofd);
 
        if (obd->obd_recovering == 0)
                ofd_postrecov(env, ofd);
index 48e28a4..d218018 100644 (file)
@@ -1457,23 +1457,6 @@ out:
        return !!rc;
 }
 
        return !!rc;
 }
 
-static int ofd_obd_notify(struct obd_device *obd, struct obd_device *unused,
-                         enum obd_notify_event ev, void *data)
-{
-       switch (ev) {
-       case OBD_NOTIFY_CONFIG:
-               LASSERT(obd->obd_no_conn);
-               spin_lock(&obd->obd_dev_lock);
-               obd->obd_no_conn = 0;
-               spin_unlock(&obd->obd_dev_lock);
-               break;
-       default:
-               CDEBUG(D_INFO, "%s: Unhandled notification %#x\n",
-                      obd->obd_name, ev);
-       }
-       return 0;
-}
-
 /*
  * Handle quota control requests to consult current usage/limit.
  *
 /*
  * Handle quota control requests to consult current usage/limit.
  *
@@ -1527,6 +1510,5 @@ struct obd_ops ofd_obd_ops = {
        .o_precleanup           = ofd_precleanup,
        .o_ping                 = ofd_ping,
        .o_health_check         = ofd_health_check,
        .o_precleanup           = ofd_precleanup,
        .o_ping                 = ofd_ping,
        .o_health_check         = ofd_health_check,
-       .o_notify               = ofd_obd_notify,
        .o_quotactl             = ofd_quotactl,
 };
        .o_quotactl             = ofd_quotactl,
 };