Whamcloud - gitweb
b=18948
author anserper <anserper>
Tue, 5 May 2009 21:08:28 +0000 (21:08 +0000)
committer anserper <anserper>
Tue, 5 May 2009 21:08:28 +0000 (21:08 +0000)
o=Brian Behlendorf
i=Nathan Rutman
i=Robert Read

Recovery console messages cleanup

lustre/include/obd.h
lustre/ldlm/ldlm_lib.c
lustre/mds/handler.c
lustre/mds/mds_lov.c
lustre/obdclass/genops.c
lustre/obdfilter/filter.c
lustre/osc/osc_create.c
lustre/ptlrpc/client.c
lustre/ptlrpc/import.c

index 8101b35..cfeb383 100644 (file)
@@ -975,6 +975,7 @@ struct obd_device {
         int                              obd_max_recoverable_clients;
         int                              obd_connected_clients;
         int                              obd_recoverable_clients;
+        int                              obd_stale_clients;
         int                              obd_delayed_clients;
         spinlock_t                       obd_processing_task_lock; /* BH lock (timer) */
         pid_t                            obd_processing_task;
index e66e2aa..57f3f2e 100644 (file)
@@ -1115,7 +1115,17 @@ static void target_release_saved_req(struct ptlrpc_request *req)
 
 static void target_send_delayed_replies(struct obd_device *obd)
 {
+        int max_clients = obd->obd_max_recoverable_clients;
         struct ptlrpc_request *req, *tmp;
+        time_t elapsed_time = max_t(time_t, 1, cfs_time_current_sec() -
+                                    obd->obd_recovery_start);
+
+        LCONSOLE_INFO("%s: Recovery period over after %d:%.02d, of %d clients "
+                      "%d recovered and %d %s evicted.\n", obd->obd_name,
+                      (int)elapsed_time/60, (int)elapsed_time%60, max_clients,
+                      max_clients - obd->obd_recoverable_clients,
+                      obd->obd_stale_clients,
+                      obd->obd_stale_clients == 1 ? "was" : "were");
 
         LCONSOLE_INFO("%s: sending delayed replies to recovered clients\n",
                       obd->obd_name);
@@ -1147,8 +1157,9 @@ static void target_finish_recovery(struct obd_device *obd)
         /* when recovery finished, cleanup orphans on mds and ost */
         if (OBT(obd) && OBP(obd, postrecov)) {
                 int rc = OBP(obd, postrecov)(obd);
-                LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name,
-                              rc < 0 ? "failed" : "complete", rc);
+                if (rc < 0)
+                        LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
+                                      obd->obd_name, rc);
         }
         target_send_delayed_replies(obd);
 }
@@ -1272,11 +1283,11 @@ static void reset_recovery_timer(struct obd_device *, int, int);
 static void target_recovery_expired(unsigned long castmeharder)
 {
         struct obd_device *obd = (struct obd_device *)castmeharder;
-        LCONSOLE_WARN("%s: recovery period over; %d clients never reconnected "
-                      "after %lds (%d clients did)\n",
-                      obd->obd_name, obd->obd_recoverable_clients,
-                      cfs_time_current_sec()- obd->obd_recovery_start,
-                      obd->obd_connected_clients);
+        CDEBUG(D_HA, "%s: recovery period over; %d clients never reconnected "
+               "after %lds (%d clients did)\n", obd->obd_name,
+               obd->obd_recoverable_clients,
+               cfs_time_current_sec() - obd->obd_recovery_start,
+               obd->obd_connected_clients);
 
         spin_lock_bh(&obd->obd_processing_task_lock);
         obd->obd_abort_recovery = 1;
@@ -1340,7 +1351,7 @@ static void check_and_start_recovery_timer(struct obd_device *obd,
                 spin_unlock_bh(&obd->obd_processing_task_lock);
                 return;
         }
-        CWARN("%s: starting recovery timer\n", obd->obd_name);
+        CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
         obd->obd_recovery_start = cfs_time_current_sec();
         /* minimum */
         obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
index 7d4396d..4b02b98 100644 (file)
@@ -2103,17 +2103,14 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         if (obd->obd_recovering) {
                 LCONSOLE_WARN("MDT %s now serving %s (%s%s%s), but will be in "
                               "recovery for at least %d:%.02d, or until %d "
-                              "client%s reconnect. During this time new clients"
-                              " will not be allowed to connect. "
-                              "Recovery progress can be monitored by watching "
-                              "/proc/fs/lustre/mds/%s/recovery_status.\n",
+                              "client%s reconnect%s. \n",
                               obd->obd_name, lustre_cfg_string(lcfg, 1),
                               label ?: "", label ? "/" : "", str,
                               obd->obd_recovery_timeout / 60,
                               obd->obd_recovery_timeout % 60,
                               obd->obd_recoverable_clients,
                               (obd->obd_recoverable_clients == 1) ? "":"s",
-                              obd->obd_name);
+                              (obd->obd_recoverable_clients == 1) ? "s":"");
         } else {
                 LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery "
                               "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
index c9270ab..45ff9f7 100644 (file)
@@ -923,7 +923,7 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
         }
 
         case OBD_IOC_ABORT_RECOVERY:
-                CERROR("aborting recovery for device %s\n", obd->obd_name);
+                LCONSOLE_WARN("%s: Aborting recovery.\n", obd->obd_name);
                 target_abort_recovery(obd);
                 /* obd_recovering has been changed */
                 mds_allow_cli(obd, 0);
index cdae4f4..fe78085 100644 (file)
@@ -1059,22 +1059,22 @@ void class_disconnect_stale_exports(struct obd_device *obd,
         struct list_head work_list;
         struct list_head *pos, *n;
         struct obd_export *exp;
-        int cnt = 0;
         ENTRY;
 
         CFS_INIT_LIST_HEAD(&work_list);
         spin_lock(&obd->obd_dev_lock);
+        obd->obd_stale_clients = 0;
         list_for_each_safe(pos, n, &obd->obd_exports) {
                 exp = list_entry(pos, struct obd_export, exp_obd_chain);
                 if (exp->exp_replay_needed) {
                         list_move(&exp->exp_obd_chain, &work_list);
-                        cnt++;
+                        obd->obd_stale_clients++;
                 }
         }
         spin_unlock(&obd->obd_dev_lock);
 
-        CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n",
-               obd->obd_name, cnt);
+        CDEBUG(D_HA, "%s: disconnecting %d stale clients\n",
+               obd->obd_name, obd->obd_stale_clients);
         class_disconnect_export_list(&work_list, flags);
         EXIT;
 }
index 5f49442..0337d6b 100644 (file)
@@ -1990,17 +1990,14 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
         if (obd->obd_recovering) {
                 LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in "
                               "recovery for at least %d:%.02d, or until %d "
-                              "client%s reconnect. During this time new clients"
-                              " will not be allowed to connect. "
-                              "Recovery progress can be monitored by watching "
-                              "/proc/fs/lustre/obdfilter/%s/recovery_status.\n",
+                              "client%s reconnect%s.\n",
                               obd->obd_name, lustre_cfg_string(lcfg, 1),
                               label ?: "", label ? "/" : "", str,
                               obd->obd_recovery_timeout / 60,
                               obd->obd_recovery_timeout % 60,
                               obd->obd_recoverable_clients,
-                              (obd->obd_recoverable_clients == 1) ? "":"s",
-                              obd->obd_name);
+                              obd->obd_recoverable_clients == 1 ? "":"s",
+                              obd->obd_recoverable_clients == 1 ? "s":"");
         } else {
                 LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
                               "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
index a5ce2f2..af45e6c 100644 (file)
@@ -376,7 +376,8 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                         oscc->oscc_last_id = oa->o_id;
                         ocd = &imp->imp_connect_data;
                         if (ocd->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) {
-                                CWARN("Skip orphan set, reset last objid\n");
+                                CDEBUG(D_HA, "%s: Skip orphan set, reset last "
+                                       "objid\n", oscc->oscc_obd->obd_name);
                                 oscc->oscc_next_id = oa->o_id + 1;
                         }
 
index ff0aa14..9e77607 100644 (file)
@@ -859,23 +859,36 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req)
         return rc;
 }
 
-static int ptlrpc_check_status(struct ptlrpc_request *req)
+/* Conditionally suppress specific console messages */
+static int ptlrpc_console_allow(struct ptlrpc_request *req)
 {
+        __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
         int err;
-        ENTRY;
 
-        err = lustre_msg_get_status(req->rq_repmsg);
-        if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
-                struct obd_import *imp = req->rq_import;
-                __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+        /* Suppress particular reconnect errors which are to be expected.  No
+         * errors are suppressed for the initial connection on an import */
+        if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
+            (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {
 
-                LCONSOLE_ERROR_MSG(0x011,"an error occurred while communicating"
-                               " with %s. The %s operation failed with %d\n",
-                               libcfs_nid2str(imp->imp_connection->c_peer.nid),
-                               ll_opcode2str(opc), err);
-                RETURN(err < 0 ? err : -EINVAL);
+                /* Suppress timed out reconnect requests */
+                if (req->rq_timedout)
+                        return 0;
+
+                /* Suppress unavailable/again reconnect requests */
+                err = lustre_msg_get_status(req->rq_repmsg);
+                if (err == -ENODEV || err == -EAGAIN)
+                        return 0;
         }
 
+        return 1;
+}
+
+static int ptlrpc_check_status(struct ptlrpc_request *req)
+{
+        int err;
+        ENTRY;
+
+        err = lustre_msg_get_status(req->rq_repmsg);
         if (err < 0) {
                 DEBUG_REQ(D_INFO, req, "status is %d", err);
         } else if (err > 0) {
@@ -883,6 +896,21 @@ static int ptlrpc_check_status(struct ptlrpc_request *req)
                 DEBUG_REQ(D_INFO, req, "status is %d", err);
         }
 
+        if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+                struct obd_import *imp = req->rq_import;
+                __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+                if (ptlrpc_console_allow(req))
+                        LCONSOLE_ERROR_MSG(0x011,"an error occurred while "
+                                           "communicating with %s. The %s "
+                                           "operation failed with %d\n",
+                                           libcfs_nid2str(
+                                           imp->imp_connection->c_peer.nid),
+                                           ll_opcode2str(opc), err);
+
+                RETURN(err < 0 ? err : -EINVAL);
+        }
+
         RETURN(err);
 }
 
index 38caf6a..f7cc6df 100644 (file)
@@ -524,10 +524,9 @@ static int import_select_connection(struct obd_import *imp)
 
         if (imp->imp_conn_current != imp_conn) {
                 if (imp->imp_conn_current)
-                        LCONSOLE_INFO("Changing connection for %s to %s/%s\n",
-                                      imp->imp_obd->obd_name,
-                                      imp_conn->oic_uuid.uuid,
-                                      libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+                        CDEBUG(D_HA, "Changing connection for %s to %s/%s\n",
+                               imp->imp_obd->obd_name, imp_conn->oic_uuid.uuid,
+                               libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
                 imp->imp_conn_current = imp_conn;
         }