Whamcloud - gitweb
b=5684
author jacob <jacob>
Tue, 15 Feb 2005 21:31:13 +0000 (21:31 +0000)
committer jacob <jacob>
Tue, 15 Feb 2005 21:31:13 +0000 (21:31 +0000)
r=adilger

First stab at improving some user visible error messages.

lustre/ldlm/ldlm_lockd.c
lustre/mds/handler.c
lustre/obdfilter/filter.c
lustre/ptlrpc/import.c
lustre/utils/lconf

index 214e9df..301b6af 100644 (file)
@@ -315,15 +315,20 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock)
 
 #endif /* __KERNEL__ */
 
-static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,const char *ast_type)
+static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,
+                            const char *ast_type)
 {
         struct ptlrpc_connection *conn = lock->l_export->exp_connection;
         char str[PTL_NALFMT_SIZE];
 
+        ptlrpc_peernid2str(&conn->c_peer, str);
+
+        LCONSOLE_ERROR("A client on nid %s was evicted from service %s.\n",
+                       str, lock->l_export->exp_obd->obd_name);
+
         LDLM_ERROR(lock, "%s AST failed (%d): evicting client %s@%s NID "LPX64
                    " (%s)", ast_type, rc, lock->l_export->exp_client_uuid.uuid,
-                   conn->c_remote_uuid.uuid, conn->c_peer.peer_id.nid,
-                   ptlrpc_peernid2str(&conn->c_peer, str));
+                   conn->c_remote_uuid.uuid, conn->c_peer.peer_id.nid, str);
 
         ptlrpc_fail_export(lock->l_export);
 }
index 4fdfa6b..2cf39f1 100644 (file)
@@ -1477,6 +1477,27 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         lprocfs_init_vars(mds, &lvars);
         lprocfs_obd_setup(obd, lvars.obd_vars);
 
+        if (obd->obd_recovering) {
+                LCONSOLE_WARN("MDT %s now serving %s, but will be in recovery "
+                              "until %d %s reconnect, or if no clients "
+                              "reconnect for %d:%.02d; during that time new "
+                              "clients will not be allowed to connect. "
+                              "Recovery progress can be monitored by watching "
+                              "/proc/fs/lustre/mds/%s/recovery_status.\n",
+                              obd->obd_name,
+                              lcfg->lcfg_inlbuf1,
+                              obd->obd_recoverable_clients,
+                              (obd->obd_recoverable_clients == 1) 
+                              ? "client" : "clients",
+                              (int)(OBD_RECOVERY_TIMEOUT / HZ) / 60,
+                              (int)(OBD_RECOVERY_TIMEOUT / HZ) % 60,
+                              obd->obd_name);
+        } else {
+                LCONSOLE_INFO("MDT %s now serving %s with recovery %s.\n",
+                              obd->obd_name, lcfg->lcfg_inlbuf1,
+                              obd->obd_replayable ? "enabled" : "disabled");
+        }
+
         RETURN(0);
 
 err_fs:
@@ -1676,6 +1697,8 @@ static int mds_cleanup(struct obd_device *obd, int flags)
         dev_clear_rdonly(2);
         fsfilt_put_ops(obd->obd_fsops);
 
+        LCONSOLE_INFO("MDT %s has stopped.\n", obd->obd_name);
+
         RETURN(0);
 }
 
index fa487a8..1f1f477 100644 (file)
@@ -1273,6 +1273,27 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
                 GOTO(err_post, rc);
         }
 
+        if (obd->obd_recovering) {
+                LCONSOLE_WARN("OST %s now serving %s, but will be in recovery "
+                              "until %d %s reconnect, or if no clients "
+                              "reconnect for %d:%.02d; during that time new "
+                              "clients will not be allowed to connect. "
+                              "Recovery progress can be monitored by watching "
+                              "/proc/fs/lustre/obdfilter/%s/recovery_status.\n",
+                              obd->obd_name,
+                              lcfg->lcfg_inlbuf1,
+                              obd->obd_recoverable_clients,
+                              (obd->obd_recoverable_clients == 1) 
+                              ? "client" : "clients",
+                              (int)(OBD_RECOVERY_TIMEOUT / HZ) / 60,
+                              (int)(OBD_RECOVERY_TIMEOUT / HZ) % 60,
+                              obd->obd_name);
+        } else {
+                LCONSOLE_INFO("OST %s now serving %s with recovery %s.\n",
+                              obd->obd_name, lcfg->lcfg_inlbuf1,
+                              obd->obd_replayable ? "enabled" : "disabled");
+        }
+
         RETURN(0);
 
 err_post:
@@ -1361,6 +1382,8 @@ static int filter_cleanup(struct obd_device *obd, int flags)
 
         dev_clear_rdonly(2);
 
+        LCONSOLE_INFO("OST %s has stopped.\n", obd->obd_name);
+
         RETURN(0);
 }
 
index 5a74685..f2922f3 100644 (file)
@@ -89,6 +89,22 @@ int ptlrpc_init_import(struct obd_import *imp)
         return 0;
 }
 
+#define UUID_STR "_UUID"
+static void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len)
+{
+        *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix))
+                ? uuid : uuid + strlen(prefix);
+
+        *uuid_len = strlen(*uuid_start);
+
+        if (*uuid_len < strlen(UUID_STR))
+                return;
+        
+        if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR),
+                    UUID_STR, strlen(UUID_STR)))
+                *uuid_len -= strlen(UUID_STR);
+}
+
 /* Returns true if import was FULL, false if import was already not
  * connected.
  */
@@ -100,6 +116,23 @@ int ptlrpc_set_import_discon(struct obd_import *imp)
         spin_lock_irqsave(&imp->imp_lock, flags);
 
         if (imp->imp_state == LUSTRE_IMP_FULL) {
+                char nidbuf[PTL_NALFMT_SIZE];
+                char *target_start;
+                int   target_len;
+
+                deuuidify(imp->imp_target_uuid.uuid, NULL,
+                          &target_start, &target_len);
+
+                LCONSOLE_ERROR("Connection to service %.*s via nid %s was "
+                               "lost; in progress operations using this "
+                               "service will %s.\n",
+                               target_len, target_start,
+                               ptlrpc_peernid2str(&imp->imp_connection->c_peer,
+                                                  nidbuf),
+                               imp->imp_replayable 
+                               ? "wait for recovery to complete"
+                               : "fail");
+
                 CWARN("%s: connection lost to %s@%s\n",
                       imp->imp_obd->obd_name,
                       imp->imp_target_uuid.uuid,
@@ -519,8 +552,18 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
 {
         int rc = 0;
         int inflight;
+        char *target_start;
+        int target_len;
 
         if (imp->imp_state == LUSTRE_IMP_EVICTED) {
+                deuuidify(imp->imp_target_uuid.uuid, NULL,
+                          &target_start, &target_len);
+                LCONSOLE_ERROR("This client was evicted by %.*s; in progress "
+                               "operations using this service will %s.\n",
+                               target_len, target_start,
+                               imp->imp_replayable
+                               ? "be reattempted"
+                               : "fail");
                 CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
                        imp->imp_target_uuid.uuid,
                        imp->imp_connection->c_remote_uuid.uuid);
@@ -563,6 +606,8 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
         }
 
         if (imp->imp_state == LUSTRE_IMP_RECOVER) {
+                char nidbuf[PTL_NALFMT_SIZE];
+
                 CDEBUG(D_HA, "reconnected to %s@%s\n",
                        imp->imp_target_uuid.uuid,
                        imp->imp_connection->c_remote_uuid.uuid);
@@ -572,6 +617,16 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
                         GOTO(out, rc);
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
                 ptlrpc_activate_import(imp);
+
+                deuuidify(imp->imp_target_uuid.uuid, NULL,
+                          &target_start, &target_len);
+                ptlrpc_peernid2str(&imp->imp_connection->c_peer,
+                                   nidbuf);
+
+                LCONSOLE_INFO("Connection restored to service %.*s using nid "
+                              "%s.\n",
+                              target_len, target_start, nidbuf);
+
                 CWARN("%s: connection restored to %s@%s\n",
                       imp->imp_obd->obd_name,
                       imp->imp_target_uuid.uuid,
index a9eb5b0..1da1014 100755 (executable)
@@ -88,6 +88,7 @@ ptldebug_names = {
     "reada" :     (1 << 22),
     "mmap" :      (1 << 23),
     "config" :    (1 << 24),
+    "console" :   (1 << 25),
     }
 
 subsystem_names = {
@@ -799,7 +800,7 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
                                 else:
                                     real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
                                 if link_count > 19:
-                                    panic("Entountered too many symbolic links resolving block device:", dev)
+                                    panic("Encountered too many symbolic links resolving block device:", dev)
 
                         # get the major and minor number of the realpath via ls
                         # it seems python(os.stat) does not return