int obd_max_recoverable_clients;
int obd_connected_clients;
int obd_recoverable_clients;
+ int obd_stale_clients; /* exports disconnected while still needing replay */
int obd_delayed_clients;
spinlock_t obd_processing_task_lock; /* BH lock (timer) */
pid_t obd_processing_task;
static void target_send_delayed_replies(struct obd_device *obd)
{
+ int max_clients = obd->obd_max_recoverable_clients;
struct ptlrpc_request *req, *tmp;
+ time_t elapsed_time = max_t(time_t, 1, cfs_time_current_sec() -
+ obd->obd_recovery_start);
+
+ LCONSOLE_INFO("%s: Recovery period over after %d:%.02d, of %d clients "
+ "%d recovered and %d %s evicted.\n", obd->obd_name,
+ (int)elapsed_time/60, (int)elapsed_time%60, max_clients,
+ max_clients - obd->obd_recoverable_clients,
+ obd->obd_stale_clients,
+ obd->obd_stale_clients == 1 ? "was" : "were");
LCONSOLE_INFO("%s: sending delayed replies to recovered clients\n",
obd->obd_name);
/* when recovery finished, cleanup orphans on mds and ost */
if (OBT(obd) && OBP(obd, postrecov)) {
int rc = OBP(obd, postrecov)(obd);
- LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name,
- rc < 0 ? "failed" : "complete", rc);
+ if (rc < 0)
+ LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
+ obd->obd_name, rc);
}
target_send_delayed_replies(obd);
}
static void target_recovery_expired(unsigned long castmeharder)
{
struct obd_device *obd = (struct obd_device *)castmeharder;
- LCONSOLE_WARN("%s: recovery period over; %d clients never reconnected "
- "after %lds (%d clients did)\n",
- obd->obd_name, obd->obd_recoverable_clients,
- cfs_time_current_sec()- obd->obd_recovery_start,
- obd->obd_connected_clients);
+ CDEBUG(D_HA, "%s: recovery period over; %d clients never reconnected "
+ "after %lds (%d clients did)\n", obd->obd_name,
+ obd->obd_recoverable_clients,
+ cfs_time_current_sec() - obd->obd_recovery_start,
+ obd->obd_connected_clients);
spin_lock_bh(&obd->obd_processing_task_lock);
obd->obd_abort_recovery = 1;
spin_unlock_bh(&obd->obd_processing_task_lock);
return;
}
- CWARN("%s: starting recovery timer\n", obd->obd_name);
+ CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
obd->obd_recovery_start = cfs_time_current_sec();
/* minimum */
obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
if (obd->obd_recovering) {
LCONSOLE_WARN("MDT %s now serving %s (%s%s%s), but will be in "
"recovery for at least %d:%.02d, or until %d "
- "client%s reconnect. During this time new clients"
- " will not be allowed to connect. "
- "Recovery progress can be monitored by watching "
- "/proc/fs/lustre/mds/%s/recovery_status.\n",
+ "client%s reconnect%s. \n",
obd->obd_name, lustre_cfg_string(lcfg, 1),
label ?: "", label ? "/" : "", str,
obd->obd_recovery_timeout / 60,
obd->obd_recovery_timeout % 60,
obd->obd_recoverable_clients,
(obd->obd_recoverable_clients == 1) ? "":"s",
- obd->obd_name);
+ (obd->obd_recoverable_clients == 1) ? "s":"");
} else {
LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery "
"%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
}
case OBD_IOC_ABORT_RECOVERY:
- CERROR("aborting recovery for device %s\n", obd->obd_name);
+ LCONSOLE_WARN("%s: Aborting recovery.\n", obd->obd_name);
target_abort_recovery(obd);
/* obd_recovering has been changed */
mds_allow_cli(obd, 0);
struct list_head work_list;
struct list_head *pos, *n;
struct obd_export *exp;
- int cnt = 0;
ENTRY;
CFS_INIT_LIST_HEAD(&work_list);
spin_lock(&obd->obd_dev_lock);
+ obd->obd_stale_clients = 0;
list_for_each_safe(pos, n, &obd->obd_exports) {
exp = list_entry(pos, struct obd_export, exp_obd_chain);
if (exp->exp_replay_needed) {
list_move(&exp->exp_obd_chain, &work_list);
- cnt++;
+ obd->obd_stale_clients++;
}
}
spin_unlock(&obd->obd_dev_lock);
- CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n",
- obd->obd_name, cnt);
+ CDEBUG(D_HA, "%s: disconnecting %d stale clients\n",
+ obd->obd_name, obd->obd_stale_clients);
class_disconnect_export_list(&work_list, flags);
EXIT;
}
if (obd->obd_recovering) {
LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in "
"recovery for at least %d:%.02d, or until %d "
- "client%s reconnect. During this time new clients"
- " will not be allowed to connect. "
- "Recovery progress can be monitored by watching "
- "/proc/fs/lustre/obdfilter/%s/recovery_status.\n",
+ "client%s reconnect%s.\n",
obd->obd_name, lustre_cfg_string(lcfg, 1),
label ?: "", label ? "/" : "", str,
obd->obd_recovery_timeout / 60,
obd->obd_recovery_timeout % 60,
obd->obd_recoverable_clients,
- (obd->obd_recoverable_clients == 1) ? "":"s",
- obd->obd_name);
+ obd->obd_recoverable_clients == 1 ? "":"s",
+ obd->obd_recoverable_clients == 1 ? "s":"");
} else {
LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
"%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
oscc->oscc_last_id = oa->o_id;
ocd = &imp->imp_connect_data;
if (ocd->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) {
- CWARN("Skip orphan set, reset last objid\n");
+ CDEBUG(D_HA, "%s: Skip orphan set, reset last "
+ "objid\n", oscc->oscc_obd->obd_name);
oscc->oscc_next_id = oa->o_id + 1;
}
return rc;
}
-static int ptlrpc_check_status(struct ptlrpc_request *req)
+/* Conditionally suppress specific console messages
+ *
+ * Tells the caller whether an error for req may be printed on the console.
+ * Returns 0 (suppress) only when req is an OST_CONNECT, MDS_CONNECT or
+ * MGS_CONNECT request on an import whose imp_remote_handle is already in
+ * use, and either the request timed out or its reply status is -ENODEV or
+ * -EAGAIN.  Returns 1 (allow the message) in every other case.
+ *
+ * req->rq_repmsg is only read for such reconnect requests that have not
+ * timed out. */
+static int ptlrpc_console_allow(struct ptlrpc_request *req)
{
+ __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
int err;
- ENTRY;
- err = lustre_msg_get_status(req->rq_repmsg);
- if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
- struct obd_import *imp = req->rq_import;
- __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+ /* Suppress particular reconnect errors which are to be expected. No
+ * errors are suppressed for the initial connection on an import.
+ * A remote handle that is already in use is what this check treats
+ * as "not the initial connection". */
+ if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
+ (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {
- LCONSOLE_ERROR_MSG(0x011,"an error occurred while communicating"
- " with %s. The %s operation failed with %d\n",
- libcfs_nid2str(imp->imp_connection->c_peer.nid),
- ll_opcode2str(opc), err);
- RETURN(err < 0 ? err : -EINVAL);
+ /* Suppress timed out reconnect requests; this is tested before
+ * the reply status is looked at. */
+ if (req->rq_timedout)
+ return 0;
+
+ /* Suppress unavailable/again reconnect requests, i.e. a reply
+ * status of -ENODEV or -EAGAIN. */
+ err = lustre_msg_get_status(req->rq_repmsg);
+ if (err == -ENODEV || err == -EAGAIN)
+ return 0;
}
+ return 1;
+}
+/*
+ * Turn the reply attached to req into a return code.
+ *
+ * Reads the status from req->rq_repmsg and logs any non-zero value at
+ * D_INFO.  When the reply message type is PTL_RPC_MSG_ERR, a console error
+ * naming the peer NID and the operation is printed unless
+ * ptlrpc_console_allow() returns 0, and the result is the status itself if
+ * it is negative or -EINVAL otherwise.  For any other message type the
+ * status is returned unchanged.
+ */
+static int ptlrpc_check_status(struct ptlrpc_request *req)
+{
+ int err;
+ ENTRY;
+
+ err = lustre_msg_get_status(req->rq_repmsg);
if (err < 0) {
DEBUG_REQ(D_INFO, req, "status is %d", err);
} else if (err > 0) {
DEBUG_REQ(D_INFO, req, "status is %d", err);
}
+ if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+ struct obd_import *imp = req->rq_import;
+ __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+ /* Only the console message is conditional; the error return
+ * below happens whether or not the message is printed. */
+ if (ptlrpc_console_allow(req))
+ LCONSOLE_ERROR_MSG(0x011,"an error occurred while "
+ "communicating with %s. The %s "
+ "operation failed with %d\n",
+ libcfs_nid2str(
+ imp->imp_connection->c_peer.nid),
+ ll_opcode2str(opc), err);
+ /* An error-type reply never yields a non-negative result: a zero
+ * or positive status is mapped to -EINVAL. */
+ RETURN(err < 0 ? err : -EINVAL);
+ }
+
RETURN(err);
}
if (imp->imp_conn_current != imp_conn) {
if (imp->imp_conn_current)
- LCONSOLE_INFO("Changing connection for %s to %s/%s\n",
- imp->imp_obd->obd_name,
- imp_conn->oic_uuid.uuid,
- libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+ CDEBUG(D_HA, "Changing connection for %s to %s/%s\n",
+ imp->imp_obd->obd_name, imp_conn->oic_uuid.uuid,
+ libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
imp->imp_conn_current = imp_conn;
}