From 3ee7ec78e2fa14fc8434f5224653455f4f58bebf Mon Sep 17 00:00:00 2001 From: anserper Date: Tue, 5 May 2009 21:08:28 +0000 Subject: [PATCH] =?utf8?q?=EF=BF=BD=EF=BF=BD=20b=3D18948=20=EF=BF=BD?= =?utf8?q?=EF=BF=BD=20o=3DBrian=20Behlendorf=20=EF=BF=BD=EF=BF=BD=20i=3DNa?= =?utf8?q?than=20Rutman=20=EF=BF=BD=EF=BF=BD=20i=3DRobert=20Read=20?= =?utf8?q?=EF=BF=BD=EF=BF=BD=20=EF=BF=BD=EF=BF=BD=20Recovery=20console=20m?= =?utf8?q?essages=20cleanup?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- lustre/include/obd.h | 1 + lustre/ldlm/ldlm_lib.c | 27 +++++++++++++++++-------- lustre/mds/handler.c | 7 ++----- lustre/mds/mds_lov.c | 2 +- lustre/obdclass/genops.c | 8 ++++---- lustre/obdfilter/filter.c | 9 +++------ lustre/osc/osc_create.c | 3 ++- lustre/ptlrpc/client.c | 50 ++++++++++++++++++++++++++++++++++++----------- lustre/ptlrpc/import.c | 7 +++---- 9 files changed, 74 insertions(+), 40 deletions(-) diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 8101b35..cfeb383 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -975,6 +975,7 @@ struct obd_device { int obd_max_recoverable_clients; int obd_connected_clients; int obd_recoverable_clients; + int obd_stale_clients; int obd_delayed_clients; spinlock_t obd_processing_task_lock; /* BH lock (timer) */ pid_t obd_processing_task; diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index e66e2aa..57f3f2e 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1115,7 +1115,17 @@ static void target_release_saved_req(struct ptlrpc_request *req) static void target_send_delayed_replies(struct obd_device *obd) { + int max_clients = obd->obd_max_recoverable_clients; struct ptlrpc_request *req, *tmp; + time_t elapsed_time = max_t(time_t, 1, cfs_time_current_sec() - + obd->obd_recovery_start); + + LCONSOLE_INFO("%s: Recovery period over after %d:%.02d, of %d clients " + "%d recovered and %d %s evicted.\n", obd->obd_name, + (int)elapsed_time/60, (int)elapsed_time%60, max_clients, + max_clients - obd->obd_recoverable_clients, + obd->obd_stale_clients, + obd->obd_stale_clients == 1 ? "was" : "were"); LCONSOLE_INFO("%s: sending delayed replies to recovered clients\n", obd->obd_name); @@ -1147,8 +1157,9 @@ static void target_finish_recovery(struct obd_device *obd) /* when recovery finished, cleanup orphans on mds and ost */ if (OBT(obd) && OBP(obd, postrecov)) { int rc = OBP(obd, postrecov)(obd); - LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name, - rc < 0 ? "failed" : "complete", rc); + if (rc < 0) + LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", + obd->obd_name, rc); } target_send_delayed_replies(obd); } @@ -1272,11 +1283,11 @@ static void reset_recovery_timer(struct obd_device *, int, int); static void target_recovery_expired(unsigned long castmeharder) { struct obd_device *obd = (struct obd_device *)castmeharder; - LCONSOLE_WARN("%s: recovery period over; %d clients never reconnected " - "after %lds (%d clients did)\n", - obd->obd_name, obd->obd_recoverable_clients, - cfs_time_current_sec()- obd->obd_recovery_start, - obd->obd_connected_clients); + CDEBUG(D_HA, "%s: recovery period over; %d clients never reconnected " + "after %lds (%d clients did)\n", obd->obd_name, + obd->obd_recoverable_clients, + cfs_time_current_sec() - obd->obd_recovery_start, + obd->obd_connected_clients); spin_lock_bh(&obd->obd_processing_task_lock); obd->obd_abort_recovery = 1; @@ -1340,7 +1351,7 @@ static void check_and_start_recovery_timer(struct obd_device *obd, spin_unlock_bh(&obd->obd_processing_task_lock); return; } - CWARN("%s: starting recovery timer\n", obd->obd_name); + CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name); obd->obd_recovery_start = cfs_time_current_sec(); /* minimum */ obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout; diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 7d4396d..4b02b98 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -2103,17 +2103,14 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) if (obd->obd_recovering) { LCONSOLE_WARN("MDT %s now serving %s (%s%s%s), but will be in " "recovery for at least %d:%.02d, or until %d " - "client%s reconnect. During this time new clients" - " will not be allowed to connect. " - "Recovery progress can be monitored by watching " - "/proc/fs/lustre/mds/%s/recovery_status.\n", + "client%s reconnect%s. \n", obd->obd_name, lustre_cfg_string(lcfg, 1), label ?: "", label ? "/" : "", str, obd->obd_recovery_timeout / 60, obd->obd_recovery_timeout % 60, obd->obd_recoverable_clients, (obd->obd_recoverable_clients == 1) ? "":"s", - obd->obd_name); + (obd->obd_recoverable_clients == 1) ? "s":""); } else { LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery " "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1), diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index c9270ab..45ff9f7 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -923,7 +923,7 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, } case OBD_IOC_ABORT_RECOVERY: - CERROR("aborting recovery for device %s\n", obd->obd_name); + LCONSOLE_WARN("%s: Aborting recovery.\n", obd->obd_name); target_abort_recovery(obd); /* obd_recovering has been changed */ mds_allow_cli(obd, 0); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index cdae4f4..fe78085 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1059,22 +1059,22 @@ void class_disconnect_stale_exports(struct obd_device *obd, struct list_head work_list; struct list_head *pos, *n; struct obd_export *exp; - int cnt = 0; ENTRY; CFS_INIT_LIST_HEAD(&work_list); spin_lock(&obd->obd_dev_lock); + obd->obd_stale_clients = 0; list_for_each_safe(pos, n, &obd->obd_exports) { exp = list_entry(pos, struct obd_export, exp_obd_chain); if (exp->exp_replay_needed) { list_move(&exp->exp_obd_chain, &work_list); - cnt++; + obd->obd_stale_clients++; } } spin_unlock(&obd->obd_dev_lock); - CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n", - obd->obd_name, cnt); + CDEBUG(D_HA, "%s: disconnecting %d stale clients\n", + obd->obd_name, obd->obd_stale_clients); class_disconnect_export_list(&work_list, flags); EXIT; } diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 5f49442..0337d6b 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1990,17 +1990,14 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, if (obd->obd_recovering) { LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in " "recovery for at least %d:%.02d, or until %d " - "client%s reconnect. During this time new clients" - " will not be allowed to connect. " - "Recovery progress can be monitored by watching " - "/proc/fs/lustre/obdfilter/%s/recovery_status.\n", + "client%s reconnect%s.\n", obd->obd_name, lustre_cfg_string(lcfg, 1), label ?: "", label ? "/" : "", str, obd->obd_recovery_timeout / 60, obd->obd_recovery_timeout % 60, obd->obd_recoverable_clients, - (obd->obd_recoverable_clients == 1) ? "":"s", - obd->obd_name); + obd->obd_recoverable_clients == 1 ? "":"s", + obd->obd_recoverable_clients == 1 ? "s":""); } else { LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery " "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1), diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index a5ce2f2..af45e6c 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -376,7 +376,8 @@ int osc_create(struct obd_export *exp, struct obdo *oa, oscc->oscc_last_id = oa->o_id; ocd = &imp->imp_connect_data; if (ocd->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) { - CWARN("Skip orphan set, reset last objid\n"); + CDEBUG(D_HA, "%s: Skip orphan set, reset last " + "objid\n", oscc->oscc_obd->obd_name); oscc->oscc_next_id = oa->o_id + 1; } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index ff0aa14..9e77607 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -859,23 +859,36 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) return rc; } -static int ptlrpc_check_status(struct ptlrpc_request *req) +/* Conditionally suppress specific console messages */ +static int ptlrpc_console_allow(struct ptlrpc_request *req) { + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); int err; - ENTRY; - err = lustre_msg_get_status(req->rq_repmsg); - if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { - struct obd_import *imp = req->rq_import; - __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + /* Suppress particular reconnect errors which are to be expected. No + * errors are suppressed for the initial connection on an import */ + if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && + (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { - LCONSOLE_ERROR_MSG(0x011,"an error occurred while communicating" - " with %s. The %s operation failed with %d\n", - libcfs_nid2str(imp->imp_connection->c_peer.nid), - ll_opcode2str(opc), err); - RETURN(err < 0 ? err : -EINVAL); + /* Suppress timed out reconnect requests */ + if (req->rq_timedout) + return 0; + + /* Suppress unavailable/again reconnect requests */ + err = lustre_msg_get_status(req->rq_repmsg); + if (err == -ENODEV || err == -EAGAIN) + return 0; } + return 1; +} + +static int ptlrpc_check_status(struct ptlrpc_request *req) +{ + int err; + ENTRY; + + err = lustre_msg_get_status(req->rq_repmsg); if (err < 0) { DEBUG_REQ(D_INFO, req, "status is %d", err); } else if (err > 0) { @@ -883,6 +896,21 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) DEBUG_REQ(D_INFO, req, "status is %d", err); } + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + struct obd_import *imp = req->rq_import; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + if (ptlrpc_console_allow(req)) + LCONSOLE_ERROR_MSG(0x011,"an error occurred while " + "communicating with %s. The %s " + "operation failed with %d\n", + libcfs_nid2str( + imp->imp_connection->c_peer.nid), + ll_opcode2str(opc), err); + + RETURN(err < 0 ? err : -EINVAL); + } + RETURN(err); } diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 38caf6a..f7cc6df 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -524,10 +524,9 @@ static int import_select_connection(struct obd_import *imp) if (imp->imp_conn_current != imp_conn) { if (imp->imp_conn_current) - LCONSOLE_INFO("Changing connection for %s to %s/%s\n", - imp->imp_obd->obd_name, - imp_conn->oic_uuid.uuid, - libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + CDEBUG(D_HA, "Changing connection for %s to %s/%s\n", + imp->imp_obd->obd_name, imp_conn->oic_uuid.uuid, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); imp->imp_conn_current = imp_conn; } -- 1.8.3.1