From 659bc7beea7e6a65c61b475f4b4401a02fe3c4cf Mon Sep 17 00:00:00 2001 From: tappro Date: Wed, 8 Apr 2009 09:59:33 +0000 Subject: [PATCH] - fix wrong flag check in ptlrpc_at_set_reply() - remove stale clients before finish_recovery stage - increase time for recovery if switched to vbr b:18556 i:rread,zam,nathan --- lustre/include/obd_support.h | 3 +++ lustre/ldlm/ldlm_lib.c | 17 ++++++++++++----- lustre/ptlrpc/niobuf.c | 3 ++- lustre/ptlrpc/target.c | 3 ++- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 825f4b7..860667d 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -123,6 +123,9 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, chance to generate adaptive timeout data. */ #define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/2) #endif +/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */ +#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \ + INITIAL_CONNECT_TIMEOUT) #define LONG_UNLINK 300 /* Unlink should happen before now */ /** diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index da5c0c1..0b0e05f 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1700,6 +1700,13 @@ static int target_recovery_thread(void *arg) class_disconnect_stale_exports(obd, connect_done, exp_flags_from_obd(obd) | OBD_OPT_ABORT_RECOV); + /** + * if recovery proceeds with versions then some clients may be + * timed out waiting for others and trying to reconnect. + * Extend timer for such reconnect cases. 
+ */ + if (obd->obd_version_recov) + reset_recovery_timer(obd, RECONNECT_DELAY_MAX * 2, 1); } /* next stage: replay requests */ @@ -1765,6 +1772,10 @@ static int target_recovery_thread(void *arg) /* The third stage: reply on final pings */ CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n"); + /** evict exports failed VBR */ + class_disconnect_stale_exports(obd, req_vbr_done, + exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); /** Update server last boot epoch */ lut_boot_epoch_update(lut); /* We drop recoverying flag to forward all new requests @@ -1779,17 +1790,13 @@ static int target_recovery_thread(void *arg) handle_recovery_req(thread, req, trd->trd_recovery_handler); } - /* evict exports failed VBR */ - class_disconnect_stale_exports(obd, req_vbr_done, - exp_flags_from_obd(obd) | - OBD_OPT_ABORT_RECOV); delta = (jiffies - delta) / HZ; CDEBUG(D_INFO,"4: recovery completed in %lus - %d/%d reqs/locks\n", delta, obd->obd_replayed_requests, obd->obd_replayed_locks); LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0); LASSERT(atomic_read(&obd->obd_lock_replay_clients) == 0); - if (delta > obd_timeout * 2) { + if (delta > obd_timeout * OBD_RECOVERY_FACTOR) { CWARN("too long recovery - read logs\n"); libcfs_debug_dumplog(); } diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 9f99ecc..d52527c 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -345,7 +345,8 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) (req->rq_type != PTL_RPC_MSG_ERR) && (req->rq_reqmsg != NULL) && !(lustre_msg_get_flags(req->rq_reqmsg) & - (MSG_RESENT | MSG_REPLAY | MSG_LAST_REPLAY))) { + (MSG_RESENT | MSG_REPLAY | + MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { /* early replies, errors and recovery requests don't count * toward our service time estimate */ int oldse = at_add(&svc->srv_at_estimate, service_time); diff --git a/lustre/ptlrpc/target.c b/lustre/ptlrpc/target.c index 0be3950..92ab5a9 100644 --- 
a/lustre/ptlrpc/target.c +++ b/lustre/ptlrpc/target.c @@ -276,7 +276,8 @@ void lut_boot_epoch_update(struct lu_target *lut) */ list_for_each_entry(req, &client_list, rq_list) { LASSERT(!req->rq_export->exp_delayed); - lut_client_epoch_update(&env, lut, req->rq_export); + if (!req->rq_export->exp_vbr_failed) + lut_client_epoch_update(&env, lut, req->rq_export); } /** return list back at once */ spin_lock_bh(&lut->lut_obd->obd_processing_task_lock); -- 1.8.3.1