Whamcloud - gitweb
b=18948 ; revert the commit
author anserper <anserper>
Tue, 5 May 2009 13:52:56 +0000 (13:52 +0000)
committer anserper <anserper>
Tue, 5 May 2009 13:52:56 +0000 (13:52 +0000)
lustre/include/obd.h
lustre/ldlm/ldlm_lib.c
lustre/mds/handler.c
lustre/mds/mds_lov.c
lustre/obdclass/genops.c
lustre/obdfilter/filter.c
lustre/osc/osc_create.c
lustre/ptlrpc/client.c
lustre/ptlrpc/import.c

index 4dac00d..a8d10b4 100644 (file)
@@ -895,7 +895,6 @@ struct obd_device {
         int                              obd_max_recoverable_clients;
         int                              obd_connected_clients;
         int                              obd_recoverable_clients;
-        int                              obd_stale_clients;
         spinlock_t                       obd_processing_task_lock; /* BH lock (timer) */
         pid_t                            obd_processing_task;
         /* thread to handle recovery queue */
index 6518523..0af549c 100644 (file)
@@ -1055,17 +1055,7 @@ static void target_release_saved_req(struct ptlrpc_request *req)
 
 static void target_finish_recovery(struct obd_device *obd)
 {
-        int max_clients = obd->obd_max_recoverable_clients;
         struct ptlrpc_request *req, *tmp;
-        time_t elapsed_time = max_t(time_t, 1, cfs_time_current_sec() -
-                                    obd->obd_recovery_start);
-
-        LCONSOLE_INFO("%s: Recovery period over after %d:%.02d, of %d clients "
-                      "%d recovered and %d %s evicted.\n", obd->obd_name,
-                      (int)elapsed_time/60, (int)elapsed_time%60, max_clients,
-                      max_clients - obd->obd_recoverable_clients,
-                      obd->obd_stale_clients,
-                      obd->obd_stale_clients == 1 ? "was" : "were");
 
         ldlm_reprocess_all_ns(obd->obd_namespace);
         spin_lock_bh(&obd->obd_processing_task_lock);
@@ -1081,9 +1071,8 @@ static void target_finish_recovery(struct obd_device *obd)
         /* when recovery finished, cleanup orphans on mds and ost */
         if (OBT(obd) && OBP(obd, postrecov)) {
                 int rc = OBP(obd, postrecov)(obd);
-                if (rc < 0)
-                        LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
-                                      obd->obd_name, rc);
+                LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name,
+                              rc < 0 ? "failed" : "complete", rc);
         }
 
         LCONSOLE_INFO("%s: sending delayed replies to recovered clients\n",
@@ -1189,6 +1178,11 @@ void target_abort_recovery(void *data)
         target_cancel_recovery_timer(obd);
         spin_unlock_bh(&obd->obd_processing_task_lock);
 
+        LCONSOLE_WARN("%s: recovery period over; %d clients never reconnected "
+                      "after %lds (%d clients did)\n",
+                      obd->obd_name, obd->obd_recoverable_clients,
+                      cfs_time_current_sec()- obd->obd_recovery_start,
+                      obd->obd_connected_clients);
         class_disconnect_stale_exports(obd, flags);
         abort_recovery_queue(obd);
 
@@ -1200,7 +1194,7 @@ void target_abort_recovery(void *data)
 static void target_recovery_expired(unsigned long castmeharder)
 {
         struct obd_device *obd = (struct obd_device *)castmeharder;
-        CDEBUG(D_HA, "%s: recovery timed out, aborting\n", obd->obd_name);
+        CERROR("%s: recovery timed out, aborting\n", obd->obd_name);
         spin_lock_bh(&obd->obd_processing_task_lock);
         if (obd->obd_recovering)
                 obd->obd_abort_recovery = 1;
@@ -1265,7 +1259,7 @@ static void check_and_start_recovery_timer(struct obd_device *obd,
                 spin_unlock_bh(&obd->obd_processing_task_lock);
                 return;
         }
-        CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
+        CWARN("%s: starting recovery timer\n", obd->obd_name);
         obd->obd_recovery_start = cfs_time_current_sec();
         /* minimum */
         obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
index fc1ef33..e436249 100644 (file)
@@ -2157,14 +2157,17 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         if (obd->obd_recovering) {
                 LCONSOLE_WARN("MDT %s now serving %s (%s%s%s), but will be in "
                               "recovery for at least %d:%.02d, or until %d "
-                              "client%s reconnect%s.\n",
+                              "client%s reconnect. During this time new clients"
+                              " will not be allowed to connect. "
+                              "Recovery progress can be monitored by watching "
+                              "/proc/fs/lustre/mds/%s/recovery_status.\n",
                               obd->obd_name, lustre_cfg_string(lcfg, 1),
                               label ?: "", label ? "/" : "", str,
                               obd->obd_recovery_timeout / 60,
                               obd->obd_recovery_timeout % 60,
                               obd->obd_max_recoverable_clients,
                               (obd->obd_max_recoverable_clients == 1) ? "":"s",
-                              (obd->obd_max_recoverable_clients == 1) ? "s":"");
+                              obd->obd_name);
         } else {
                 LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery "
                               "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
index 100be31..8118130 100644 (file)
@@ -857,7 +857,7 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
         }
 
         case OBD_IOC_ABORT_RECOVERY:
-                LCONSOLE_WARN("%s: Aborting recovery.\n", obd->obd_name);
+                CERROR("aborting recovery for device %s\n", obd->obd_name);
                 target_abort_recovery(obd);
                 /* obd_recovering has been changed */
                 mds_allow_cli(obd, 0);
index 98aef23..bc12c28 100644 (file)
@@ -1044,24 +1044,23 @@ void class_disconnect_stale_exports(struct obd_device *obd,
         struct list_head work_list;
         struct list_head *pos, *n;
         struct obd_export *exp;
+        int cnt = 0;
         ENTRY;
 
         CFS_INIT_LIST_HEAD(&work_list);
         spin_lock(&obd->obd_dev_lock);
-        obd->obd_stale_clients = 0;
         list_for_each_safe(pos, n, &obd->obd_exports) {
                 exp = list_entry(pos, struct obd_export, exp_obd_chain);
                 if (exp->exp_replay_needed) {
                         list_del(&exp->exp_obd_chain);
                         list_add(&exp->exp_obd_chain, &work_list);
-                        obd->obd_stale_clients++;
+                        cnt++;
                 }
         }
         spin_unlock(&obd->obd_dev_lock);
 
-        CDEBUG(D_HA, "%s: disconnecting %d stale clients\n", obd->obd_name,
-               obd->obd_stale_clients);
-
+        CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n",
+               obd->obd_name, cnt);
         class_disconnect_export_list(&work_list, flags);
         EXIT;
 }
index d9a347b..a74f29b 100644 (file)
@@ -1854,14 +1854,17 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
         if (obd->obd_recovering) {
                 LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in "
                               "recovery for at least %d:%.02d, or until %d "
-                              "client%s reconnect%s.\n",
+                              "client%s reconnect. During this time new clients"
+                              " will not be allowed to connect. "
+                              "Recovery progress can be monitored by watching "
+                              "/proc/fs/lustre/obdfilter/%s/recovery_status.\n",
                               obd->obd_name, lustre_cfg_string(lcfg, 1),
                               label ?: "", label ? "/" : "", str,
                               obd->obd_recovery_timeout / 60,
                               obd->obd_recovery_timeout % 60,
                               obd->obd_max_recoverable_clients,
-                              obd->obd_max_recoverable_clients == 1 ? "":"s",
-                              obd->obd_max_recoverable_clients == 1 ? "s":"");
+                              (obd->obd_max_recoverable_clients == 1) ? "":"s",
+                              obd->obd_name);
         } else {
                 LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
                               "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
@@ -2859,7 +2862,7 @@ static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa,
         last = filter_last_id(filter, doa.o_gr);
         skip_orphan = !!(exp->exp_connect_flags & OBD_CONNECT_SKIP_ORPHAN);
 
-        CDEBUG(D_HA, "%s: deleting orphan objects from "LPU64" to "LPU64"%s\n",
+        CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"%s\n",
                exp->exp_obd->obd_name, oa->o_id + 1, last,
                skip_orphan ? ", orphan objids won't be reused any more." : ".");
 
index d1b0c8a..7489cc5 100644 (file)
@@ -374,8 +374,7 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                         oscc->oscc_last_id = oa->o_id;
                         ocd = &imp->imp_connect_data;
                         if (ocd->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) {
-                                CDEBUG(D_HA, "%s: Skip orphan set, reset last "
-                                       "objid\n", oscc->oscc_obd->obd_name);
+                                CWARN("Skip orphan set, reset last objid\n");
                                 oscc->oscc_next_id = oa->o_id + 1;
                         }
 
index 683bf4c..42042cb 100644 (file)
@@ -848,58 +848,30 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req)
         return rc;
 }
 
-/* Conditionally suppress specific console messages */
-static int ptlrpc_console_allow(struct ptlrpc_request *req)
-{
-        __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
-        int err;
-
-        /* Suppress particular reconnect errors which are to be expected.  No
-         * errors are suppressed for the initial connection on an import */
-        if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
-            (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {
-
-                /* Suppress timed out reconnect requests */
-                if (req->rq_timedout)
-                        return 0;
-
-                /* Suppress unavailable/again reconnect requests */
-                err = lustre_msg_get_status(req->rq_repmsg);
-                if (err == -ENODEV || err == -EAGAIN)
-                        return 0;
-        }
-
-        return 1;
-}
-
 static int ptlrpc_check_status(struct ptlrpc_request *req)
 {
         int err;
         ENTRY;
 
         err = lustre_msg_get_status(req->rq_repmsg);
-        if (err < 0) {
-                DEBUG_REQ(D_INFO, req, "status is %d", err);
-        } else if (err > 0) {
-                /* XXX: translate this error from net to host */
-                DEBUG_REQ(D_INFO, req, "status is %d", err);
-        }
-
         if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
                 struct obd_import *imp = req->rq_import;
                 __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
 
-                if (ptlrpc_console_allow(req))
-                        LCONSOLE_ERROR_MSG(0x011,"an error occurred while "
-                                           "communicating with %s. The %s "
-                                           "operation failed with %d\n",
-                                           libcfs_nid2str(
-                                           imp->imp_connection->c_peer.nid),
-                                           ll_opcode2str(opc), err);
-
+                LCONSOLE_ERROR_MSG(0x011,"an error occurred while communicating"
+                               " with %s. The %s operation failed with %d\n",
+                               libcfs_nid2str(imp->imp_connection->c_peer.nid),
+                               ll_opcode2str(opc), err);
                 RETURN(err < 0 ? err : -EINVAL);
         }
 
+        if (err < 0) {
+                DEBUG_REQ(D_INFO, req, "status is %d", err);
+        } else if (err > 0) {
+                /* XXX: translate this error from net to host */
+                DEBUG_REQ(D_INFO, req, "status is %d", err);
+        }
+
         RETURN(err);
 }
 
@@ -1340,11 +1312,7 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
                   req->rq_net_err ? "network error" : "timeout",
                   (long)req->rq_sent, cfs_time_current_sec() - req->rq_sent);
 
-        spin_lock(&req->rq_lock);
-        req->rq_timedout = 1;
-        spin_unlock(&req->rq_lock);
-
-        if (imp && ptlrpc_console_allow(req)) {
+        if (imp) {
                 LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s %lus ago"
                               " has timed out (limit %lus).\n", req->rq_xid,
                               req->rq_import->imp_obd->obd_name,
@@ -1356,6 +1324,10 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
         if (imp != NULL && obd_debug_peer_on_timeout)
                 LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
 
+        spin_lock(&req->rq_lock);
+        req->rq_timedout = 1;
+        spin_unlock(&req->rq_lock);
+
         ptlrpc_unregister_reply(req, async_unlink);
         ptlrpc_unregister_bulk(req, async_unlink);
 
index fd00220..5e6d08a 100644 (file)
@@ -512,9 +512,10 @@ static int import_select_connection(struct obd_import *imp)
 
         if (imp->imp_conn_current != imp_conn) {
                 if (imp->imp_conn_current)
-                        CDEBUG(D_HA, "Changing connection for %s to %s/%s\n",
-                               imp->imp_obd->obd_name, imp_conn->oic_uuid.uuid,
-                               libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+                        LCONSOLE_INFO("Changing connection for %s to %s/%s\n",
+                                      imp->imp_obd->obd_name,
+                                      imp_conn->oic_uuid.uuid,
+                                      libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
                 imp->imp_conn_current = imp_conn;
         }