From: rread Date: Sun, 18 Jul 2004 00:29:37 +0000 (+0000) Subject: b=3869,1742 X-Git-Tag: v1_7_100~2076 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=9641c7f5ea20ef6163752a3b4cd956a8ada40c9c;hp=738ac90f521a8e69a0a4f19c97e0f8911207eda6 b=3869,1742 These are the remaining fixes from 1742 that are needed to allow a client to reconnect during recovery. The request that triggered 3869 on the last run was actually a RESENT request that was sent before recovery had finished, which this patch will fix. --- diff --git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h index 8cc24b9..664f936 100644 --- a/lustre/include/linux/lustre_export.h +++ b/lustre/include/linux/lustre_export.h @@ -77,8 +77,9 @@ struct obd_export { spinlock_t exp_lock; /* protects flags int below */ /* ^ protects exp_outstanding_replies too */ int exp_flags; - int exp_failed:1; - int exp_libclient:1; /* liblustre client? */ + int exp_failed:1, + exp_replay_needed:1, + exp_libclient:1; /* liblustre client? */ union { struct mds_export_data eu_mds_data; struct filter_export_data eu_filter_data; diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 7d28ef0..1183afe 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -887,6 +887,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req, if (obd->obd_processing_task == current->pid || transno < obd->obd_next_recovery_transno) { /* Processing the queue right now, don't re-add. 
*/ + lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); LASSERT(list_empty(&req->rq_list)); spin_unlock_bh(&obd->obd_processing_task_lock); OBD_FREE(reqmsg, req->rq_reqlen); @@ -990,7 +991,12 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc) list_add(&req->rq_list, &obd->obd_delayed_reply_queue); spin_lock_bh(&obd->obd_processing_task_lock); - --obd->obd_recoverable_clients; + /* only count the first "replay over" request from each + export */ + if (req->rq_export->exp_replay_needed) { + --obd->obd_recoverable_clients; + req->rq_export->exp_replay_needed = 0; + } recovery_done = (obd->obd_recoverable_clients == 0); spin_unlock_bh(&obd->obd_processing_task_lock); diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index fccf34f..e32ba3f 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -327,6 +327,7 @@ static int mds_read_last_rcvd(struct obd_device *obd, struct file *file) spin_lock_init(&med->med_open_lock); mcd = NULL; + exp->exp_replay_needed = 1; obd->obd_recoverable_clients++; obd->obd_max_recoverable_clients++; class_export_put(exp); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 7adb22d..dc45318 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -485,7 +485,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) spin_lock_init(&fed->fed_lock); fcd = NULL; + exp->exp_replay_needed = 1; obd->obd_recoverable_clients++; + obd->obd_max_recoverable_clients++; class_export_put(exp); CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n", diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 67e4393..3c5cf7e 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -785,9 +785,6 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa, LASSERT((void *)(niobuf - niocount) == lustre_msg_buf(req->rq_reqmsg, 2, niocount * sizeof(*niobuf))); osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? 
requested_nob:0); - spin_lock_irqsave(&req->rq_lock, flags); - req->rq_no_resend = 1; - spin_unlock_irqrestore(&req->rq_lock, flags); /* size[0] still sizeof (*body) */ if (opc == OST_WRITE) { @@ -908,8 +905,6 @@ restart_bulk: rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm, page_count, pga, &requested_nob, &niocount, &request); - /* NB ^ sets rq_no_resend */ - if (rc != 0) return (rc); @@ -938,13 +933,6 @@ static int brw_interpret(struct ptlrpc_request *request, struct brw_page *pga = aa->aa_pga; ENTRY; - /* XXX bug 937 here */ - if (rc == -ETIMEDOUT && request->rq_resend) { - DEBUG_REQ(D_HA, request, "BULK TIMEOUT"); - LBUG(); /* re-send. later. */ - //goto restart_bulk; - } - rc = osc_brw_fini_request(request, oa, requested_nob, niocount, page_count, pga, rc); RETURN (rc); @@ -964,8 +952,6 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa, rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm, page_count, pga, &requested_nob, &nio_count, &request); - /* NB ^ sets rq_no_resend */ - if (rc == 0) { LASSERT(sizeof(*aa) <= sizeof(request->rq_async_args)); aa = (struct osc_brw_async_args *)&request->rq_async_args; diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index cf5e0be..5513345 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -100,6 +100,10 @@ int ptlrpc_set_import_discon(struct obd_import *imp) spin_lock_irqsave(&imp->imp_lock, flags); if (imp->imp_state == LUSTRE_IMP_FULL) { + CERROR("%s: connection lost to %s@%s\n", + imp->imp_obd->obd_name, + imp->imp_target_uuid.uuid, + imp->imp_connection->c_remote_uuid.uuid); IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); spin_unlock_irqrestore(&imp->imp_lock, flags); obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); @@ -407,6 +411,9 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, if (imp->imp_invalid) { IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); } else if (MSG_CONNECT_RECOVERING & msg_flags) { + CDEBUG(D_HA, 
"%s: reconnected to %s during replay\n", + imp->imp_obd->obd_name, + imp->imp_target_uuid.uuid); imp->imp_resend_replay = 1; IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); } else { @@ -476,7 +483,15 @@ static int completed_replay_interpret(struct ptlrpc_request *req, void * data, int rc) { atomic_dec(&req->rq_import->imp_replay_inflight); - ptlrpc_import_recovery_state_machine(req->rq_import); + if (req->rq_status == 0) { + ptlrpc_import_recovery_state_machine(req->rq_import); + } else { + CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, " + "reconnecting\n", + req->rq_import->imp_obd->obd_name, req->rq_status); + ptlrpc_connect_import(req->rq_import, NULL); + } + RETURN(0); } @@ -557,6 +572,10 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) GOTO(out, rc); IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); ptlrpc_activate_import(imp); + CERROR("%s: connection restored to %s@%s\n", + imp->imp_obd->obd_name, + imp->imp_target_uuid.uuid, + imp->imp_connection->c_remote_uuid.uuid); } if (imp->imp_state == LUSTRE_IMP_FULL) {