X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fptlrpc%2Frecover.c;h=6bf1d35e4bd43afbd5e688ab8781dc7a7511a900;hb=fdb061c9ce3fd16e805ff07b1e80c8de46110004;hp=18bc6f4c891c4289077bf8167a7ccea0b787dcc4;hpb=a888a27ac14736d3df8e730a3909d026d6f40f49;p=fs%2Flustre-release.git diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index 18bc6f4..6bf1d35 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -54,7 +54,7 @@ void ptlrpc_run_recovery_over_upcall(struct obd_device *obd) argv[0] = obd_lustre_upcall; argv[1] = "RECOVERY_OVER"; - argv[2] = obd->obd_uuid.uuid; + argv[2] = (char *)obd->obd_uuid.uuid; argv[3] = NULL; envp[0] = "HOME=/"; @@ -68,8 +68,8 @@ void ptlrpc_run_recovery_over_upcall(struct obd_device *obd) argv[0], argv[1], argv[2], rc); } else { - CERROR("Invoked upcall %s %s %s\n", - argv[0], argv[1], argv[2]); + CWARN("Invoked upcall %s %s %s\n", + argv[0], argv[1], argv[2]); } } @@ -92,10 +92,10 @@ void ptlrpc_run_failed_import_upcall(struct obd_import* imp) argv[0] = obd_lustre_upcall; argv[1] = "FAILED_IMPORT"; - argv[2] = imp->imp_target_uuid.uuid; + argv[2] = (char *)imp->imp_target_uuid.uuid; argv[3] = imp->imp_obd->obd_name; - argv[4] = imp->imp_connection->c_remote_uuid.uuid; - argv[5] = imp->imp_obd->obd_uuid.uuid; + argv[4] = (char *)imp->imp_connection->c_remote_uuid.uuid; + argv[5] = (char *)imp->imp_obd->obd_uuid.uuid; argv[6] = NULL; envp[0] = "HOME=/"; @@ -105,12 +105,12 @@ void ptlrpc_run_failed_import_upcall(struct obd_import* imp) rc = USERMODEHELPER(argv[0], argv, envp); if (rc < 0) { CERROR("Error invoking recovery upcall %s %s %s %s %s: %d; " - "check /proc/sys/lustre/lustre_upcall\n", + "check /proc/sys/lustre/upcall\n", argv[0], argv[1], argv[2], argv[3], argv[4],rc); } else { - CERROR("Invoked upcall %s %s %s %s %s\n", - argv[0], argv[1], argv[2], argv[3], argv[4]); + CWARN("Invoked upcall %s %s %s %s %s\n", + argv[0], argv[1], argv[2], argv[3], argv[4]); } #else if (imp->imp_state == LUSTRE_IMP_CLOSED) { @@ -130,16 +130,16 @@ void ptlrpc_initiate_recovery(struct obd_import *imp) LASSERT (obd_lustre_upcall != NULL); if (strcmp(obd_lustre_upcall, "DEFAULT") == 0) { - CDEBUG(D_ERROR, "%s: starting recovery without upcall\n", + CDEBUG(D_HA, "%s: starting recovery without upcall\n", imp->imp_target_uuid.uuid); ptlrpc_connect_import(imp, NULL); } else if (strcmp(obd_lustre_upcall, "NONE") == 0) { - CDEBUG(D_ERROR, "%s: recovery diabled\n", + CDEBUG(D_HA, "%s: recovery disabled\n", imp->imp_target_uuid.uuid); } else { - CDEBUG(D_ERROR, "%s: calling upcall to start recovery\n", + CDEBUG(D_HA, "%s: calling upcall to start recovery\n", imp->imp_target_uuid.uuid); ptlrpc_run_failed_import_upcall(imp); } @@ -151,7 +151,7 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight) { int rc = 0; struct list_head *tmp, *pos; - struct ptlrpc_request *req; + struct ptlrpc_request *req = NULL; unsigned long flags; __u64 last_transno; ENTRY; @@ -187,16 +187,35 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight) */ list_for_each_safe(tmp, pos, &imp->imp_replay_list) { req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + + /* If need to resend, stop on the matching one first. It's + possible though it's already been committed, so in that case + we'll just continue with replay */ + if (imp->imp_resend_replay && + req->rq_transno == last_transno) { + lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); + break; + } + if (req->rq_transno > last_transno) { - rc = ptlrpc_replay_req(req); - if (rc) { - CERROR("recovery replay error %d for req " - LPD64"\n", rc, req->rq_xid); - RETURN(rc); - } - *inflight = 1; + imp->imp_last_replay_transno = req->rq_transno; break; } + + req = NULL; + } + + imp->imp_resend_replay = 0; + + if (req != NULL) { + rc = ptlrpc_replay_req(req); + if (rc) { + CERROR("recovery replay error %d for req " + LPD64"\n", rc, req->rq_xid); + RETURN(rc); + } + imp->imp_reqs_replayed++; + *inflight = 1; } RETURN(rc); } @@ -268,17 +287,17 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) ptlrpc_deactivate_import(imp); } - rc = ptlrpc_connect_import(imp, NULL); + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + rc = ptlrpc_connect_import(imp, NULL); } - /* Wait for recovery to complete and resend. If evicted, then this request will be errored out later.*/ spin_lock_irqsave(&failed_req->rq_lock, flags); - if (!failed_req->rq_no_resend) - failed_req->rq_resend = 1; + failed_req->rq_resend = 1; spin_unlock_irqrestore(&failed_req->rq_lock, flags); - + EXIT; } @@ -286,7 +305,7 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) * This should only be called by the ioctl interface, currently * with the lctl deactivate and activate commands. */ -int ptlrpc_set_import_active(struct obd_import *imp, int active) +int ptlrpc_set_import_active(struct obd_import *imp, int active) { struct obd_device *obd = imp->imp_obd; int rc = 0; @@ -297,10 +316,12 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active) * requests. */ if (!active) { ptlrpc_invalidate_import(imp, 0); - } + imp->imp_deactive = 1; + } /* When activating, mark import valid, and attempt recovery */ if (active) { + imp->imp_deactive = 0; CDEBUG(D_HA, "setting import %s VALID\n", imp->imp_target_uuid.uuid); rc = ptlrpc_recover_import(imp, NULL); @@ -313,10 +334,10 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid) { int rc; ENTRY; - + /* force import to be disconnected. */ ptlrpc_set_import_discon(imp); - + rc = ptlrpc_recover_import_no_retry(imp, new_uuid); RETURN(rc); @@ -335,6 +356,24 @@ int ptlrpc_import_in_recovery(struct obd_import *imp) return in_recovery; } +int ptlrpc_import_control_recovery(struct obd_import *imp, int disable) +{ + unsigned long flags; + + /* with imp_deactivate == 1 pinger won't initiate re-connect */ + spin_lock_irqsave(&imp->imp_lock, flags); + if (disable) + imp->imp_deactive = 1; + else + imp->imp_deactive = 0; + if (imp->imp_state == LUSTRE_IMP_DISCON) { + imp->imp_force_verify = 1; + ptlrpc_pinger_wake_up(); + } + spin_unlock_irqrestore(&imp->imp_lock, flags); + RETURN(0); +} + static int ptlrpc_recover_import_no_retry(struct obd_import *imp, char *new_uuid) { @@ -357,13 +396,13 @@ static int ptlrpc_recover_import_no_retry(struct obd_import *imp, if (rc) RETURN(rc); - CDEBUG(D_ERROR, "%s: recovery started, waiting\n", + CDEBUG(D_HA, "%s: recovery started, waiting\n", imp->imp_target_uuid.uuid); lwi = LWI_TIMEOUT(MAX(obd_timeout * HZ, 1), NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); - CDEBUG(D_ERROR, "%s: recovery finished\n", + CDEBUG(D_HA, "%s: recovery finished\n", imp->imp_target_uuid.uuid); RETURN(rc); @@ -388,6 +427,9 @@ void ptlrpc_fail_export(struct obd_export *exp) CDEBUG(D_HA, "disconnecting export %p/%s\n", exp, exp->exp_client_uuid.uuid); + if (obd_dump_on_timeout) + portals_debug_dumplog(); + /* Most callers into obd_disconnect are removing their own reference * (request, for example) in addition to the one from the hash table. * We don't have such a reference here, so make one. */