From 3b887cffaea8f68285d458371b81358304815c7f Mon Sep 17 00:00:00 2001 From: tappro Date: Mon, 31 Aug 2009 06:00:19 +0000 Subject: [PATCH] Branch HEAD b=19844 i=rread i=fanyong Extend the recovery timer after each replay by a larger value so that it also covers a client reconnection, which may happen until bug 18948 is fixed. Remove unused code. --- lustre/include/obd.h | 4 ---- lustre/ldlm/ldlm_lib.c | 38 ++++++++++++++++++++++---------------- lustre/obdclass/genops.c | 5 +++-- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 586f9fe..5550d89 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -1078,10 +1078,6 @@ struct obd_device { struct list_head obd_exports_timed; time_t obd_eviction_timer; /* for ping evictor */ - /* XXX encapsulate all this recovery data into one struct */ - svc_handler_t obd_recovery_handler; - pid_t obd_processing_task; - int obd_max_recoverable_clients; int obd_connected_clients; int obd_stale_clients; diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index ce3e65a..e0d3cba 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1183,11 +1183,9 @@ static void target_finish_recovery(struct obd_device *obd) ldlm_reprocess_all_ns(obd->obd_namespace); spin_lock_bh(&obd->obd_processing_task_lock); - if (list_empty(&obd->obd_req_replay_queue) && - list_empty(&obd->obd_lock_replay_queue) && - list_empty(&obd->obd_final_req_queue)) { - obd->obd_processing_task = 0; - } else { + if (!list_empty(&obd->obd_req_replay_queue) || + !list_empty(&obd->obd_lock_replay_queue) || + !list_empty(&obd->obd_final_req_queue)) { CERROR("%s: Recovery queues ( %s%s%s) are not empty\n", obd->obd_name, list_empty(&obd->obd_req_replay_queue) ? 
"" : "req ", @@ -1198,6 +1196,8 @@ static void target_finish_recovery(struct obd_device *obd) } spin_unlock_bh(&obd->obd_processing_task_lock); + obd->obd_recovery_end = cfs_time_current_sec(); + /* when recovery finished, cleanup orphans on mds and ost */ if (OBT(obd) && OBP(obd, postrecov)) { int rc = OBP(obd, postrecov)(obd); @@ -1205,8 +1205,6 @@ static void target_finish_recovery(struct obd_device *obd) LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", obd->obd_name, rc); } - - obd->obd_recovery_end = cfs_time_current_sec(); EXIT; } @@ -1573,7 +1571,7 @@ static int target_recovery_overseer(struct obd_device *obd, * reset timer, recovery will proceed with versions now, * timeout is set just to handle reconnection delays */ - reset_recovery_timer(obd, RECONNECT_DELAY_MAX * 2, 1); + reset_recovery_timer(obd, RECONNECT_DELAY_MAX, 1); /** Wait for recovery events again, after evicting bad clients */ } } while (!abort && expired); @@ -1688,11 +1686,16 @@ static int handle_recovery_req(struct ptlrpc_thread *thread, lu_context_exit(&req->rq_session); lu_context_fini(&req->rq_session); /* don't reset timer for final stage */ - if (!exp_finished(req->rq_export)) - reset_recovery_timer(class_exp2obd(req->rq_export), - AT_OFF ? obd_timeout : - at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate), 1); - + if (!exp_finished(req->rq_export)) { + /** + * XXX: until bug 18948 is fixed (enable AT for request copy) + * the client may reconnect during recovery so we may need to + * wait RECONNECT_DELAY_MAX after each replay instead of + * at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate); + */ + reset_recovery_timer(class_exp2obd(req->rq_export), AT_OFF ? 
+ obd_timeout : RECONNECT_DELAY_MAX, 1); + } /** * bz18031: increase next_recovery_transno before ptlrpc_free_clone() * will drop exp_rpc reference @@ -1842,9 +1845,12 @@ void target_stop_recovery_thread(struct obd_device *obd) spin_lock_bh(&obd->obd_processing_task_lock); if (obd->obd_recovery_data.trd_processing_task > 0) { struct target_recovery_data *trd = &obd->obd_recovery_data; - CERROR("%s: Aborting recovery\n", obd->obd_name); - obd->obd_abort_recovery = 1; - cfs_waitq_signal(&obd->obd_next_transno_waitq); + /** recovery can be done but postrecovery is not yet */ + if (obd->obd_recovering) { + CERROR("%s: Aborting recovery\n", obd->obd_name); + obd->obd_abort_recovery = 1; + cfs_waitq_signal(&obd->obd_next_transno_waitq); + } spin_unlock_bh(&obd->obd_processing_task_lock); wait_for_completion(&trd->trd_finishing); } else { diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index ab08463..4055de6 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1353,10 +1353,11 @@ void obd_exports_barrier(struct obd_device *obd) spin_unlock(&obd->obd_dev_lock); cfs_schedule_timeout(CFS_TASK_UNINT, cfs_time_seconds(waited)); if (waited > 5 && IS_PO2(waited)) { - LCONSOLE_WARN("Waiting for obd_unlinked_exports " + LCONSOLE_WARN("%s is waiting for obd_unlinked_exports " "more than %d seconds. " "The obd refcount = %d. Is it stuck?\n", - waited, atomic_read(&obd->obd_refcount)); + obd->obd_name, waited, + atomic_read(&obd->obd_refcount)); dump_exports(obd); } waited *= 2; -- 1.8.3.1