From eda8eab5abeef1117ae1082533b5888a91875e12 Mon Sep 17 00:00:00 2001 From: anserper Date: Tue, 19 May 2009 18:22:19 +0000 Subject: [PATCH] b=18948 i=Andreas Dilger i=Nathan Rutman o=Brian Behlendorf Snappy Recovery v4 --- lustre/include/obd.h | 1 + lustre/include/obd_class.h | 6 +++--- lustre/ldlm/ldlm_lib.c | 24 +++++++++++------------ lustre/obdclass/genops.c | 17 ++++++++-------- lustre/obdfilter/filter.c | 11 ++++------- lustre/osc/osc_create.c | 3 ++- lustre/ptlrpc/client.c | 48 ++++++++++++++++++++++++++++++++++++++++++---- lustre/ptlrpc/import.c | 7 +++---- 8 files changed, 78 insertions(+), 39 deletions(-) diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 4a0d5f72..c98c730 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -1073,6 +1073,7 @@ struct obd_device { int obd_max_recoverable_clients; int obd_connected_clients; int obd_recoverable_clients; + int obd_stale_clients; int obd_delayed_clients; spinlock_t obd_processing_task_lock; /* BH lock (timer) */ __u64 obd_next_recovery_transno; diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index dba5e2d..3ea3a79 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -207,9 +207,9 @@ int class_disconnect(struct obd_export *exp); void class_fail_export(struct obd_export *exp); void class_disconnect_exports(struct obd_device *obddev); int class_manual_cleanup(struct obd_device *obd); -int class_disconnect_stale_exports(struct obd_device *, - int (*test_export)(struct obd_export *), - enum obd_option flags); +void class_disconnect_stale_exports(struct obd_device *, + int (*test_export)(struct obd_export *), + enum obd_option flags); static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) { diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index c394253..234048d 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1208,8 +1208,9 @@ static void target_finish_recovery(struct obd_device *obd) /* when recovery finished, cleanup orphans on mds and ost */ if (OBT(obd) && OBP(obd, postrecov)) { int rc = OBP(obd, postrecov)(obd); - LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name, - rc < 0 ? "failed" : "complete", rc); + if (rc < 0) + LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", + obd->obd_name, rc); } obd->obd_recovery_end = cfs_time_current_sec(); @@ -1363,7 +1364,7 @@ static void check_and_start_recovery_timer(struct obd_device *obd) spin_unlock_bh(&obd->obd_processing_task_lock); return; } - CWARN("%s: starting recovery timer\n", obd->obd_name); + CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name); obd->obd_recovery_start = cfs_time_current_sec(); /* minimum */ obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout; @@ -1748,11 +1749,10 @@ static int target_recovery_thread(void *arg) /* If some clients haven't replayed requests in time, evict them */ if (obd->obd_abort_recovery) { - int stale; CERROR("lock replay is aborted\n"); - stale = class_disconnect_stale_exports(obd, lock_replay_done, - exp_flags_from_obd(obd) | - OBD_OPT_ABORT_RECOV); + class_disconnect_stale_exports(obd, lock_replay_done, + exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); abort_lock_replay_queue(obd); } LASSERT(list_empty(&obd->obd_lock_replay_queue)); @@ -1844,11 +1844,11 @@ EXPORT_SYMBOL(target_recovery_fini); static void target_recovery_expired(unsigned long castmeharder) { struct obd_device *obd = (struct obd_device *)castmeharder; - LCONSOLE_WARN("%s: recovery timed out; %d clients never reconnected " - "after %lds (%d clients did)\n", - obd->obd_name, obd->obd_recoverable_clients, - cfs_time_current_sec()- obd->obd_recovery_start, - obd->obd_connected_clients); + CDEBUG(D_HA, "%s: recovery timed out; %d clients never reconnected " + "after %lds (%d clients did)\n", + obd->obd_name, obd->obd_recoverable_clients, + cfs_time_current_sec()- obd->obd_recovery_start, + obd->obd_connected_clients); spin_lock_bh(&obd->obd_processing_task_lock); obd->obd_version_recov = 1; diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index cb3c85d..e4330a5 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1157,18 +1157,18 @@ EXPORT_SYMBOL(class_disconnect_exports); /* Remove exports that have not completed recovery. */ -int class_disconnect_stale_exports(struct obd_device *obd, - int (*test_export)(struct obd_export *), - enum obd_option flags) +void class_disconnect_stale_exports(struct obd_device *obd, + int (*test_export)(struct obd_export *), + enum obd_option flags) { struct list_head work_list; struct list_head *pos, *n; struct obd_export *exp; - int cnt = 0; ENTRY; CFS_INIT_LIST_HEAD(&work_list); spin_lock(&obd->obd_dev_lock); + obd->obd_stale_clients = 0; list_for_each_safe(pos, n, &obd->obd_exports) { exp = list_entry(pos, struct obd_export, exp_obd_chain); if (test_export(exp)) @@ -1180,7 +1180,7 @@ int class_disconnect_stale_exports(struct obd_device *obd, &exp->exp_obd->obd_uuid)) continue; - cnt++; + obd->obd_stale_clients++; CDEBUG(D_ERROR, "%s: disconnect stale client %s@%s\n", obd->obd_name, exp->exp_client_uuid.uuid, exp->exp_connection == NULL ? "" : @@ -1188,10 +1188,11 @@ int class_disconnect_stale_exports(struct obd_device *obd, } spin_unlock(&obd->obd_dev_lock); - CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n", - obd->obd_name, cnt); + CDEBUG(D_HA, "%s: disconnecting %d stale clients\n", obd->obd_name, + obd->obd_stale_clients); + class_disconnect_export_list(&work_list, flags); - RETURN(cnt); + EXIT; } EXPORT_SYMBOL(class_disconnect_stale_exports); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 2bf5ae4..b19c0bb 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -2171,17 +2171,14 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg, if (obd->obd_recovering) { LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in " "recovery for at least %d:%.02d, or until %d " - "client%s reconnect. During this time new clients" - " will not be allowed to connect. " - "Recovery progress can be monitored by watching " - "/proc/fs/lustre/obdfilter/%s/recovery_status.\n", + "client%s reconnect%s.\n", obd->obd_name, lustre_cfg_string(lcfg, 1), label ?: "", label ? "/" : "", str, obd->obd_recovery_timeout / 60, obd->obd_recovery_timeout % 60, obd->obd_max_recoverable_clients, (obd->obd_max_recoverable_clients == 1) ? "":"s", - obd->obd_name); + (obd->obd_max_recoverable_clients == 1) ? "s":""); } else { LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery " "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1), @@ -3584,7 +3581,7 @@ static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa, skip_orphan = !!(exp->exp_connect_flags & OBD_CONNECT_SKIP_ORPHAN); - CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"%s\n", + CDEBUG(D_HA, "%s: deleting orphan objects from "LPU64" to "LPU64"%s\n", exp->exp_obd->obd_name, oa->o_id + 1, last, skip_orphan ? ", orphan objids won't be reused any more." : "."); @@ -4500,7 +4497,7 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp, switch (cmd) { case OBD_IOC_ABORT_RECOVERY: { - CERROR("aborting recovery for device %s\n", obd->obd_name); + LCONSOLE_WARN("%s: Aborting recovery.\n", obd->obd_name); target_stop_recovery_thread(obd); RETURN(0); } diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index 39338ca..a851c35 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -381,7 +381,8 @@ int osc_create(struct obd_export *exp, struct obdo *oa, oscc->oscc_last_id = oa->o_id; ocd = &imp->imp_connect_data; if (ocd->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) { - CWARN("Skip orphan set, reset the last objid\n"); + CDEBUG(D_HA, "%s: Skip orphan set, reset last " + "objid\n", oscc->oscc_obd->obd_name); oscc->oscc_next_id = oa->o_id + 1; } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index e043a9e..aaad618 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -914,6 +914,31 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) return rc; } +/* Conditionally suppress specific console messages */ +static int ptlrpc_console_allow(struct ptlrpc_request *req) +{ + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + int err; + + /* Suppress particular reconnect errors which are to be expected. No + * errors are suppressed for the initial connection on an import */ + if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && + (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { + + /* Suppress timed out reconnect requests */ + if (req->rq_timedout) + return 0; + + /* Suppress unavailable/again reconnect requests */ + err = lustre_msg_get_status(req->rq_repmsg); + if (err == -ENODEV || err == -EAGAIN) + return 0; + } + + return 1; +} + + static int ptlrpc_check_status(struct ptlrpc_request *req) { int err; @@ -937,6 +962,21 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) DEBUG_REQ(D_INFO, req, "status is %d", err); } + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + struct obd_import *imp = req->rq_import; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + + if (ptlrpc_console_allow(req)) + LCONSOLE_ERROR_MSG(0x011,"an error occurred while " + "communicating with %s. The %s " + "operation failed with %d\n", + libcfs_nid2str( + imp->imp_connection->c_peer.nid), + ll_opcode2str(opc), err); + + RETURN(err < 0 ? err : -EINVAL); + } + RETURN(err); } @@ -1468,6 +1508,10 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) int rc = 0; ENTRY; + spin_lock(&req->rq_lock); + req->rq_timedout = 1; + spin_unlock(&req->rq_lock); + DEBUG_REQ(D_WARNING, req, "Request x"LPU64" sent from %s to NID %s " CFS_DURATION_T"s ago has %s (limit "CFS_DURATION_T"s).\n", req->rq_xid, imp ? imp->imp_obd->obd_name : "", @@ -1479,10 +1523,6 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) if (imp != NULL && obd_debug_peer_on_timeout) LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer); - spin_lock(&req->rq_lock); - req->rq_timedout = 1; - spin_unlock(&req->rq_lock); - ptlrpc_unregister_reply(req, async_unlink); ptlrpc_unregister_bulk(req, async_unlink); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 5ddb3c6..4824569 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -526,10 +526,9 @@ static int import_select_connection(struct obd_import *imp) if (imp->imp_conn_current != imp_conn) { if (imp->imp_conn_current) - LCONSOLE_INFO("Changing connection for %s to %s/%s\n", - imp->imp_obd->obd_name, - imp_conn->oic_uuid.uuid, - libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + CDEBUG(D_HA, "Changing connection for %s to %s/%s\n", + imp->imp_obd->obd_name, imp_conn->oic_uuid.uuid, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); imp->imp_conn_current = imp_conn; } -- 1.8.3.1