From: anserper Date: Tue, 5 May 2009 03:26:56 +0000 (+0000) Subject: b=18948 X-Git-Tag: GIT_EPOCH_B1_6~2^5~38 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=c7fbf07215dcdc5b7c48cb5f730907860c40e795;p=fs%2Flustre-release.git b=18948 o=Brian Behlendorf i=Nathan Rutman i=Robert Read Recovery console messages cleanup --- diff --git a/lustre/include/obd.h b/lustre/include/obd.h index a8d10b4..4dac00d 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -895,6 +895,7 @@ struct obd_device { int obd_max_recoverable_clients; int obd_connected_clients; int obd_recoverable_clients; + int obd_stale_clients; spinlock_t obd_processing_task_lock; /* BH lock (timer) */ pid_t obd_processing_task; /* thread to handle recovery queue */ diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 0af549c..6518523 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1055,7 +1055,17 @@ static void target_release_saved_req(struct ptlrpc_request *req) static void target_finish_recovery(struct obd_device *obd) { + int max_clients = obd->obd_max_recoverable_clients; struct ptlrpc_request *req, *tmp; + time_t elapsed_time = max_t(time_t, 1, cfs_time_current_sec() - + obd->obd_recovery_start); + + LCONSOLE_INFO("%s: Recovery period over after %d:%.02d, of %d clients " + "%d recovered and %d %s evicted.\n", obd->obd_name, + (int)elapsed_time/60, (int)elapsed_time%60, max_clients, + max_clients - obd->obd_recoverable_clients, + obd->obd_stale_clients, + obd->obd_stale_clients == 1 ? "was" : "were"); ldlm_reprocess_all_ns(obd->obd_namespace); spin_lock_bh(&obd->obd_processing_task_lock); @@ -1071,8 +1081,9 @@ static void target_finish_recovery(struct obd_device *obd) /* when recovery finished, cleanup orphans on mds and ost */ if (OBT(obd) && OBP(obd, postrecov)) { int rc = OBP(obd, postrecov)(obd); - LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name, - rc < 0 ? "failed" : "complete", rc); + if (rc < 0) + LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", + obd->obd_name, rc); } LCONSOLE_INFO("%s: sending delayed replies to recovered clients\n", @@ -1178,11 +1189,6 @@ void target_abort_recovery(void *data) target_cancel_recovery_timer(obd); spin_unlock_bh(&obd->obd_processing_task_lock); - LCONSOLE_WARN("%s: recovery period over; %d clients never reconnected " - "after %lds (%d clients did)\n", - obd->obd_name, obd->obd_recoverable_clients, - cfs_time_current_sec()- obd->obd_recovery_start, - obd->obd_connected_clients); class_disconnect_stale_exports(obd, flags); abort_recovery_queue(obd); @@ -1194,7 +1200,7 @@ void target_abort_recovery(void *data) static void target_recovery_expired(unsigned long castmeharder) { struct obd_device *obd = (struct obd_device *)castmeharder; - CERROR("%s: recovery timed out, aborting\n", obd->obd_name); + CDEBUG(D_HA, "%s: recovery timed out, aborting\n", obd->obd_name); spin_lock_bh(&obd->obd_processing_task_lock); if (obd->obd_recovering) obd->obd_abort_recovery = 1; @@ -1259,7 +1265,7 @@ static void check_and_start_recovery_timer(struct obd_device *obd, spin_unlock_bh(&obd->obd_processing_task_lock); return; } - CWARN("%s: starting recovery timer\n", obd->obd_name); + CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name); obd->obd_recovery_start = cfs_time_current_sec(); /* minimum */ obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout; diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index e436249..fc1ef33 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -2157,17 +2157,14 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) if (obd->obd_recovering) { LCONSOLE_WARN("MDT %s now serving %s (%s%s%s), but will be in " "recovery for at least %d:%.02d, or until %d " - "client%s reconnect. During this time new clients" - " will not be allowed to connect. " - "Recovery progress can be monitored by watching " - "/proc/fs/lustre/mds/%s/recovery_status.\n", + "client%s reconnect%s.\n", obd->obd_name, lustre_cfg_string(lcfg, 1), label ?: "", label ? "/" : "", str, obd->obd_recovery_timeout / 60, obd->obd_recovery_timeout % 60, obd->obd_max_recoverable_clients, (obd->obd_max_recoverable_clients == 1) ? "":"s", - obd->obd_name); + (obd->obd_max_recoverable_clients == 1) ? "s":""); } else { LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery " "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1), diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 8118130..100be31 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -857,7 +857,7 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, } case OBD_IOC_ABORT_RECOVERY: - CERROR("aborting recovery for device %s\n", obd->obd_name); + LCONSOLE_WARN("%s: Aborting recovery.\n", obd->obd_name); target_abort_recovery(obd); /* obd_recovering has been changed */ mds_allow_cli(obd, 0); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index bc12c28..98aef23 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1044,23 +1044,24 @@ void class_disconnect_stale_exports(struct obd_device *obd, struct list_head work_list; struct list_head *pos, *n; struct obd_export *exp; - int cnt = 0; ENTRY; CFS_INIT_LIST_HEAD(&work_list); spin_lock(&obd->obd_dev_lock); + obd->obd_stale_clients = 0; list_for_each_safe(pos, n, &obd->obd_exports) { exp = list_entry(pos, struct obd_export, exp_obd_chain); if (exp->exp_replay_needed) { list_del(&exp->exp_obd_chain); list_add(&exp->exp_obd_chain, &work_list); - cnt++; + obd->obd_stale_clients++; } } spin_unlock(&obd->obd_dev_lock); - CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n", - obd->obd_name, cnt); + CDEBUG(D_HA, "%s: disconnecting %d stale clients\n", obd->obd_name, + obd->obd_stale_clients); + class_disconnect_export_list(&work_list, flags); EXIT; } diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index a74f29b..d9a347b 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1854,17 +1854,14 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, if (obd->obd_recovering) { LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in " "recovery for at least %d:%.02d, or until %d " - "client%s reconnect. During this time new clients" - " will not be allowed to connect. " - "Recovery progress can be monitored by watching " - "/proc/fs/lustre/obdfilter/%s/recovery_status.\n", + "client%s reconnect%s.\n", obd->obd_name, lustre_cfg_string(lcfg, 1), label ?: "", label ? "/" : "", str, obd->obd_recovery_timeout / 60, obd->obd_recovery_timeout % 60, obd->obd_max_recoverable_clients, - (obd->obd_max_recoverable_clients == 1) ? "":"s", - obd->obd_name); + obd->obd_max_recoverable_clients == 1 ? "":"s", + obd->obd_max_recoverable_clients == 1 ? "s":""); } else { LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery " "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1), @@ -2862,7 +2859,7 @@ static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa, last = filter_last_id(filter, doa.o_gr); skip_orphan = !!(exp->exp_connect_flags & OBD_CONNECT_SKIP_ORPHAN); - CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"%s\n", + CDEBUG(D_HA, "%s: deleting orphan objects from "LPU64" to "LPU64"%s\n", exp->exp_obd->obd_name, oa->o_id + 1, last, skip_orphan ? ", orphan objids won't be reused any more." : "."); diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index 7489cc5..d1b0c8a 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -374,7 +374,8 @@ int osc_create(struct obd_export *exp, struct obdo *oa, oscc->oscc_last_id = oa->o_id; ocd = &imp->imp_connect_data; if (ocd->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) { - CWARN("Skip orphan set, reset last objid\n"); + CDEBUG(D_HA, "%s: Skip orphan set, reset last " + "objid\n", oscc->oscc_obd->obd_name); oscc->oscc_next_id = oa->o_id + 1; } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 42042cb..683bf4c 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -848,23 +848,36 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) return rc; } -static int ptlrpc_check_status(struct ptlrpc_request *req) +/* Conditionally suppress specific console messages */ +static int ptlrpc_console_allow(struct ptlrpc_request *req) { + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); int err; - ENTRY; - err = lustre_msg_get_status(req->rq_repmsg); - if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { - struct obd_import *imp = req->rq_import; - __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + /* Suppress particular reconnect errors which are to be expected. No + * errors are suppressed for the initial connection on an import */ + if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && + (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { - LCONSOLE_ERROR_MSG(0x011,"an error occurred while communicating" - " with %s. The %s operation failed with %d\n", - libcfs_nid2str(imp->imp_connection->c_peer.nid), - ll_opcode2str(opc), err); - RETURN(err < 0 ? err : -EINVAL); + /* Suppress timed out reconnect requests */ + if (req->rq_timedout) + return 0; + + /* Suppress unavailable/again reconnect requests */ + err = lustre_msg_get_status(req->rq_repmsg); + if (err == -ENODEV || err == -EAGAIN) + return 0; } + return 1; +} + +static int ptlrpc_check_status(struct ptlrpc_request *req) +{ + int err; + ENTRY; + + err = lustre_msg_get_status(req->rq_repmsg); if (err < 0) { DEBUG_REQ(D_INFO, req, "status is %d", err); } else if (err > 0) { @@ -872,6 +885,21 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) DEBUG_REQ(D_INFO, req, "status is %d", err); } + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + struct obd_import *imp = req->rq_import; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + if (ptlrpc_console_allow(req)) + LCONSOLE_ERROR_MSG(0x011,"an error occurred while " + "communicating with %s. The %s " + "operation failed with %d\n", + libcfs_nid2str( + imp->imp_connection->c_peer.nid), + ll_opcode2str(opc), err); + + RETURN(err < 0 ? err : -EINVAL); + } + RETURN(err); } @@ -1312,7 +1340,11 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) req->rq_net_err ? "network error" : "timeout", (long)req->rq_sent, cfs_time_current_sec() - req->rq_sent); - if (imp) { + spin_lock(&req->rq_lock); + req->rq_timedout = 1; + spin_unlock(&req->rq_lock); + + if (imp && ptlrpc_console_allow(req)) { LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s %lus ago" " has timed out (limit %lus).\n", req->rq_xid, req->rq_import->imp_obd->obd_name, @@ -1324,10 +1356,6 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) if (imp != NULL && obd_debug_peer_on_timeout) LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer); - spin_lock(&req->rq_lock); - req->rq_timedout = 1; - spin_unlock(&req->rq_lock); - ptlrpc_unregister_reply(req, async_unlink); ptlrpc_unregister_bulk(req, async_unlink); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 5e6d08a..fd00220 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -512,10 +512,9 @@ static int import_select_connection(struct obd_import *imp) if (imp->imp_conn_current != imp_conn) { if (imp->imp_conn_current) - LCONSOLE_INFO("Changing connection for %s to %s/%s\n", - imp->imp_obd->obd_name, - imp_conn->oic_uuid.uuid, - libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + CDEBUG(D_HA, "Changing connection for %s to %s/%s\n", + imp->imp_obd->obd_name, imp_conn->oic_uuid.uuid, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); imp->imp_conn_current = imp_conn; }