From 088215ea2c45f4b71f5e0474c84fcb47643bd8e8 Mon Sep 17 00:00:00 2001 From: jacob Date: Tue, 15 Feb 2005 21:31:13 +0000 Subject: [PATCH] b=5684 r=adilger First stab at improving some user visible error messages. --- lustre/ldlm/ldlm_lockd.c | 11 +++++++--- lustre/mds/handler.c | 23 ++++++++++++++++++++ lustre/obdfilter/filter.c | 23 ++++++++++++++++++++ lustre/ptlrpc/import.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ lustre/utils/lconf | 3 ++- 5 files changed, 111 insertions(+), 4 deletions(-) diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 214e9df..301b6af 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -315,15 +315,20 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) #endif /* __KERNEL__ */ -static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,const char *ast_type) +static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, + const char *ast_type) { struct ptlrpc_connection *conn = lock->l_export->exp_connection; char str[PTL_NALFMT_SIZE]; + ptlrpc_peernid2str(&conn->c_peer, str); + + LCONSOLE_ERROR("A client on nid %s was evicted from service %s.\n", + str, lock->l_export->exp_obd->obd_name); + LDLM_ERROR(lock, "%s AST failed (%d): evicting client %s@%s NID "LPX64 " (%s)", ast_type, rc, lock->l_export->exp_client_uuid.uuid, - conn->c_remote_uuid.uuid, conn->c_peer.peer_id.nid, - ptlrpc_peernid2str(&conn->c_peer, str)); + conn->c_remote_uuid.uuid, conn->c_peer.peer_id.nid, str); ptlrpc_fail_export(lock->l_export); } diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 4fdfa6b..2cf39f1 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -1477,6 +1477,27 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) lprocfs_init_vars(mds, &lvars); lprocfs_obd_setup(obd, lvars.obd_vars); + if (obd->obd_recovering) { + LCONSOLE_WARN("MDT %s now serving %s, but will be in recovery " + "until %d %s reconnect, or if no clients " + "reconnect for %d:%.02d; during that time new " + "clients will not be allowed to connect. " + "Recovery progress can be monitored by watching " + "/proc/fs/lustre/mds/%s/recovery_status.\n", + obd->obd_name, + lcfg->lcfg_inlbuf1, + obd->obd_recoverable_clients, + (obd->obd_recoverable_clients == 1) + ? "client" : "clients", + (int)(OBD_RECOVERY_TIMEOUT / HZ) / 60, + (int)(OBD_RECOVERY_TIMEOUT / HZ) % 60, + obd->obd_name); + } else { + LCONSOLE_INFO("MDT %s now serving %s with recovery %s.\n", + obd->obd_name, lcfg->lcfg_inlbuf1, + obd->obd_replayable ? "enabled" : "disabled"); + } + RETURN(0); err_fs: @@ -1676,6 +1697,8 @@ static int mds_cleanup(struct obd_device *obd, int flags) dev_clear_rdonly(2); fsfilt_put_ops(obd->obd_fsops); + LCONSOLE_INFO("MDT %s has stopped.\n", obd->obd_name); + RETURN(0); } diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index fa487a8..1f1f477 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1273,6 +1273,27 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, GOTO(err_post, rc); } + if (obd->obd_recovering) { + LCONSOLE_WARN("OST %s now serving %s, but will be in recovery " + "until %d %s reconnect, or if no clients " + "reconnect for %d:%.02d; during that time new " + "clients will not be allowed to connect. " + "Recovery progress can be monitored by watching " + "/proc/fs/lustre/obdfilter/%s/recovery_status.\n", + obd->obd_name, + lcfg->lcfg_inlbuf1, + obd->obd_recoverable_clients, + (obd->obd_recoverable_clients == 1) + ? "client" : "clients", + (int)(OBD_RECOVERY_TIMEOUT / HZ) / 60, + (int)(OBD_RECOVERY_TIMEOUT / HZ) % 60, + obd->obd_name); + } else { + LCONSOLE_INFO("OST %s now serving %s with recovery %s.\n", + obd->obd_name, lcfg->lcfg_inlbuf1, + obd->obd_replayable ? "enabled" : "disabled"); + } + RETURN(0); err_post: @@ -1361,6 +1382,8 @@ static int filter_cleanup(struct obd_device *obd, int flags) dev_clear_rdonly(2); + LCONSOLE_INFO("OST %s has stopped.\n", obd->obd_name); + RETURN(0); } diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 5a74685..f2922f3 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -89,6 +89,22 @@ int ptlrpc_init_import(struct obd_import *imp) return 0; } +#define UUID_STR "_UUID" +static void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) +{ + *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix)) + ? uuid : uuid + strlen(prefix); + + *uuid_len = strlen(*uuid_start); + + if (*uuid_len < strlen(UUID_STR)) + return; + + if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR), + UUID_STR, strlen(UUID_STR))) + *uuid_len -= strlen(UUID_STR); +} + /* Returns true if import was FULL, false if import was already not * connected. */ @@ -100,6 +116,23 @@ int ptlrpc_set_import_discon(struct obd_import *imp) spin_lock_irqsave(&imp->imp_lock, flags); if (imp->imp_state == LUSTRE_IMP_FULL) { + char nidbuf[PTL_NALFMT_SIZE]; + char *target_start; + int target_len; + + deuuidify(imp->imp_target_uuid.uuid, NULL, + &target_start, &target_len); + + LCONSOLE_ERROR("Connection to service %.*s via nid %s was " + "lost; in progress operations using this " + "service will %s.\n", + target_len, target_start, + ptlrpc_peernid2str(&imp->imp_connection->c_peer, + nidbuf), + imp->imp_replayable + ? "wait for recovery to complete" + : "fail"); + CWARN("%s: connection lost to %s@%s\n", imp->imp_obd->obd_name, imp->imp_target_uuid.uuid, @@ -519,8 +552,18 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) { int rc = 0; int inflight; + char *target_start; + int target_len; if (imp->imp_state == LUSTRE_IMP_EVICTED) { + deuuidify(imp->imp_target_uuid.uuid, NULL, + &target_start, &target_len); + LCONSOLE_ERROR("This client was evicted by %.*s; in progress " + "operations using this service will %s.\n", + target_len, target_start, + imp->imp_replayable + ? "be reattempted" + : "fail"); CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", imp->imp_target_uuid.uuid, imp->imp_connection->c_remote_uuid.uuid); @@ -563,6 +606,8 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) } if (imp->imp_state == LUSTRE_IMP_RECOVER) { + char nidbuf[PTL_NALFMT_SIZE]; + CDEBUG(D_HA, "reconnected to %s@%s\n", imp->imp_target_uuid.uuid, imp->imp_connection->c_remote_uuid.uuid); @@ -572,6 +617,16 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) GOTO(out, rc); IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); ptlrpc_activate_import(imp); + + deuuidify(imp->imp_target_uuid.uuid, NULL, + &target_start, &target_len); + ptlrpc_peernid2str(&imp->imp_connection->c_peer, + nidbuf); + + LCONSOLE_INFO("Connection restored to service %.*s using nid " + "%s.\n", + target_len, target_start, nidbuf); + CWARN("%s: connection restored to %s@%s\n", imp->imp_obd->obd_name, imp->imp_target_uuid.uuid, diff --git a/lustre/utils/lconf b/lustre/utils/lconf index a9eb5b0..1da1014 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -88,6 +88,7 @@ ptldebug_names = { "reada" : (1 << 22), "mmap" : (1 << 23), "config" : (1 << 24), + "console" : (1 << 25), } subsystem_names = { @@ -799,7 +800,7 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1): else: real_dev = os.path.join(os.path.dirname(real_dev), dev_link) if link_count > 19: - panic("Entountered too many symbolic links resolving block device:", dev) + panic("Encountered too many symbolic links resolving block device:", dev) # get the major and minor number of the realpath via ls # it seems python(os.stat) does not return -- 1.8.3.1