Whamcloud - gitweb
- fix wrong flag check in ptlrpc_at_set_reply()
author: tappro <tappro>
Wed, 8 Apr 2009 09:59:33 +0000 (09:59 +0000)
committer: tappro <tappro>
Wed, 8 Apr 2009 09:59:33 +0000 (09:59 +0000)
- remove stale clients before the finish_recovery stage
- increase the recovery time if recovery has switched to VBR
b:18556
i:rread,zam,nathan

lustre/include/obd_support.h
lustre/ldlm/ldlm_lib.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/target.c

index 825f4b7..860667d 100644 (file)
@@ -123,6 +123,9 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
    chance to generate adaptive timeout data. */
 #define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/2)
 #endif
+/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */
+#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \
+                             INITIAL_CONNECT_TIMEOUT)
 #define LONG_UNLINK 300          /* Unlink should happen before now */
 
 /**
index da5c0c1..0b0e05f 100644 (file)
@@ -1700,6 +1700,13 @@ static int target_recovery_thread(void *arg)
                 class_disconnect_stale_exports(obd, connect_done,
                                                exp_flags_from_obd(obd) |
                                                OBD_OPT_ABORT_RECOV);
+                /**
+                 * if recovery proceeds with versions then some clients may be
+                 * timed out waiting for others and trying to reconnect.
+                 * Extend timer for such reconnect cases.
+                 */
+                if (obd->obd_version_recov)
+                        reset_recovery_timer(obd, RECONNECT_DELAY_MAX * 2, 1);
         }
 
         /* next stage: replay requests */
@@ -1765,6 +1772,10 @@ static int target_recovery_thread(void *arg)
 
         /* The third stage: reply on final pings */
         CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n");
+        /** evict exports failed VBR */
+        class_disconnect_stale_exports(obd, req_vbr_done,
+                                       exp_flags_from_obd(obd) |
+                                       OBD_OPT_ABORT_RECOV);
         /** Update server last boot epoch */
         lut_boot_epoch_update(lut);
         /* We drop recoverying flag to forward all new requests
@@ -1779,17 +1790,13 @@ static int target_recovery_thread(void *arg)
                 handle_recovery_req(thread, req,
                                     trd->trd_recovery_handler);
         }
-        /* evict exports failed VBR */
-        class_disconnect_stale_exports(obd, req_vbr_done,
-                                       exp_flags_from_obd(obd) |
-                                       OBD_OPT_ABORT_RECOV);
 
         delta = (jiffies - delta) / HZ;
         CDEBUG(D_INFO,"4: recovery completed in %lus - %d/%d reqs/locks\n",
               delta, obd->obd_replayed_requests, obd->obd_replayed_locks);
         LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
         LASSERT(atomic_read(&obd->obd_lock_replay_clients) == 0);
-        if (delta > obd_timeout * 2) {
+        if (delta > obd_timeout * OBD_RECOVERY_FACTOR) {
                 CWARN("too long recovery - read logs\n");
                 libcfs_debug_dumplog();
         }
index 9f99ecc..d52527c 100644 (file)
@@ -345,7 +345,8 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
             (req->rq_type != PTL_RPC_MSG_ERR) &&
             (req->rq_reqmsg != NULL) &&
             !(lustre_msg_get_flags(req->rq_reqmsg) &
-              (MSG_RESENT | MSG_REPLAY | MSG_LAST_REPLAY))) {
+              (MSG_RESENT | MSG_REPLAY |
+               MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) {
                 /* early replies, errors and recovery requests don't count
                  * toward our service time estimate */
                 int oldse = at_add(&svc->srv_at_estimate, service_time);
index 0be3950..92ab5a9 100644 (file)
@@ -276,7 +276,8 @@ void lut_boot_epoch_update(struct lu_target *lut)
          */
         list_for_each_entry(req, &client_list, rq_list) {
                 LASSERT(!req->rq_export->exp_delayed);
-                lut_client_epoch_update(&env, lut, req->rq_export);
+                if (!req->rq_export->exp_vbr_failed)
+                        lut_client_epoch_update(&env, lut, req->rq_export);
         }
         /** return list back at once */
         spin_lock_bh(&lut->lut_obd->obd_processing_task_lock);