int obd_max_recoverable_clients;
int obd_connected_clients;
int obd_recoverable_clients;
+ int obd_stale_clients;
int obd_delayed_clients;
spinlock_t obd_processing_task_lock; /* BH lock (timer) */
__u64 obd_next_recovery_transno;
void class_fail_export(struct obd_export *exp);
void class_disconnect_exports(struct obd_device *obddev);
int class_manual_cleanup(struct obd_device *obd);
-int class_disconnect_stale_exports(struct obd_device *,
- int (*test_export)(struct obd_export *),
- enum obd_option flags);
+void class_disconnect_stale_exports(struct obd_device *,
+ int (*test_export)(struct obd_export *),
+ enum obd_option flags);
static inline enum obd_option exp_flags_from_obd(struct obd_device *obd)
{
/* when recovery finished, cleanup orphans on mds and ost */
if (OBT(obd) && OBP(obd, postrecov)) {
int rc = OBP(obd, postrecov)(obd);
- LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name,
- rc < 0 ? "failed" : "complete", rc);
+ if (rc < 0)
+ LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
+ obd->obd_name, rc);
}
obd->obd_recovery_end = cfs_time_current_sec();
spin_unlock_bh(&obd->obd_processing_task_lock);
return;
}
- CWARN("%s: starting recovery timer\n", obd->obd_name);
+ CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
obd->obd_recovery_start = cfs_time_current_sec();
/* minimum */
obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
/* If some clients haven't replayed requests in time, evict them */
if (obd->obd_abort_recovery) {
- int stale;
CERROR("lock replay is aborted\n");
- stale = class_disconnect_stale_exports(obd, lock_replay_done,
- exp_flags_from_obd(obd) |
- OBD_OPT_ABORT_RECOV);
+ class_disconnect_stale_exports(obd, lock_replay_done,
+ exp_flags_from_obd(obd) |
+ OBD_OPT_ABORT_RECOV);
abort_lock_replay_queue(obd);
}
LASSERT(list_empty(&obd->obd_lock_replay_queue));
static void target_recovery_expired(unsigned long castmeharder)
{
struct obd_device *obd = (struct obd_device *)castmeharder;
- LCONSOLE_WARN("%s: recovery timed out; %d clients never reconnected "
- "after %lds (%d clients did)\n",
- obd->obd_name, obd->obd_recoverable_clients,
- cfs_time_current_sec()- obd->obd_recovery_start,
- obd->obd_connected_clients);
+ CDEBUG(D_HA, "%s: recovery timed out; %d clients never reconnected "
+ "after %lds (%d clients did)\n",
+ obd->obd_name, obd->obd_recoverable_clients,
+ cfs_time_current_sec()- obd->obd_recovery_start,
+ obd->obd_connected_clients);
spin_lock_bh(&obd->obd_processing_task_lock);
obd->obd_version_recov = 1;
/* Remove exports that have not completed recovery.
*/
-int class_disconnect_stale_exports(struct obd_device *obd,
- int (*test_export)(struct obd_export *),
- enum obd_option flags)
+void class_disconnect_stale_exports(struct obd_device *obd,
+ int (*test_export)(struct obd_export *),
+ enum obd_option flags)
{
struct list_head work_list;
struct list_head *pos, *n;
struct obd_export *exp;
- int cnt = 0;
ENTRY;
CFS_INIT_LIST_HEAD(&work_list);
spin_lock(&obd->obd_dev_lock);
+ obd->obd_stale_clients = 0;
list_for_each_safe(pos, n, &obd->obd_exports) {
exp = list_entry(pos, struct obd_export, exp_obd_chain);
if (test_export(exp))
&exp->exp_obd->obd_uuid))
continue;
- cnt++;
+ obd->obd_stale_clients++;
CDEBUG(D_ERROR, "%s: disconnect stale client %s@%s\n",
obd->obd_name, exp->exp_client_uuid.uuid,
exp->exp_connection == NULL ? "<unknown>" :
}
spin_unlock(&obd->obd_dev_lock);
- CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n",
- obd->obd_name, cnt);
+ CDEBUG(D_HA, "%s: disconnecting %d stale clients\n", obd->obd_name,
+ obd->obd_stale_clients);
+
class_disconnect_export_list(&work_list, flags);
- RETURN(cnt);
+ EXIT;
}
EXPORT_SYMBOL(class_disconnect_stale_exports);
if (obd->obd_recovering) {
LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in "
"recovery for at least %d:%.02d, or until %d "
- "client%s reconnect. During this time new clients"
- " will not be allowed to connect. "
- "Recovery progress can be monitored by watching "
- "/proc/fs/lustre/obdfilter/%s/recovery_status.\n",
+ "client%s reconnect%s.\n",
obd->obd_name, lustre_cfg_string(lcfg, 1),
label ?: "", label ? "/" : "", str,
obd->obd_recovery_timeout / 60,
obd->obd_recovery_timeout % 60,
obd->obd_max_recoverable_clients,
(obd->obd_max_recoverable_clients == 1) ? "":"s",
- obd->obd_name);
+ (obd->obd_max_recoverable_clients == 1) ? "s":"");
} else {
LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
"%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
skip_orphan = !!(exp->exp_connect_flags & OBD_CONNECT_SKIP_ORPHAN);
- CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"%s\n",
+ CDEBUG(D_HA, "%s: deleting orphan objects from "LPU64" to "LPU64"%s\n",
exp->exp_obd->obd_name, oa->o_id + 1, last,
skip_orphan ? ", orphan objids won't be reused any more." : ".");
switch (cmd) {
case OBD_IOC_ABORT_RECOVERY: {
- CERROR("aborting recovery for device %s\n", obd->obd_name);
+ LCONSOLE_WARN("%s: Aborting recovery.\n", obd->obd_name);
target_stop_recovery_thread(obd);
RETURN(0);
}
oscc->oscc_last_id = oa->o_id;
ocd = &imp->imp_connect_data;
if (ocd->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) {
- CWARN("Skip orphan set, reset the last objid\n");
+ CDEBUG(D_HA, "%s: Skip orphan set, reset last "
+ "objid\n", oscc->oscc_obd->obd_name);
oscc->oscc_next_id = oa->o_id + 1;
}
return rc;
}
+/* Conditionally suppress specific console messages
+ *
+ * Decide whether an error on \a req should be reported on the console.
+ *
+ * \param req  request being checked; its request message supplies the
+ *             opcode and its import supplies the remote handle
+ *
+ * \retval 1   the caller may print the console message
+ * \retval 0   the message should be suppressed
+ */
+static int ptlrpc_console_allow(struct ptlrpc_request *req)
+{
+ __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+ int err;
+
+ /* Suppress particular reconnect errors which are to be expected. No
+ * errors are suppressed for the initial connection on an import
+ *
+ * Only the three connect opcodes are candidates for suppression, and
+ * only when the import's remote handle is already in use. */
+ if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
+ (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {
+
+ /* Suppress timed out reconnect requests
+ *
+ * This test runs before the reply status is read below.
+ * NOTE(review): presumably the reply message is not meaningful
+ * for a timed-out request - confirm before reordering. */
+ if (req->rq_timedout)
+ return 0;
+
+ /* Suppress unavailable/again reconnect requests
+ * (reply status of -ENODEV or -EAGAIN) */
+ err = lustre_msg_get_status(req->rq_repmsg);
+ if (err == -ENODEV || err == -EAGAIN)
+ return 0;
+ }
+
+ return 1;
+}
+
+
static int ptlrpc_check_status(struct ptlrpc_request *req)
{
int err;
DEBUG_REQ(D_INFO, req, "status is %d", err);
}
+ if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+ struct obd_import *imp = req->rq_import;
+ __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+ if (ptlrpc_console_allow(req))
+ LCONSOLE_ERROR_MSG(0x011,"an error occurred while "
+ "communicating with %s. The %s "
+ "operation failed with %d\n",
+ libcfs_nid2str(
+ imp->imp_connection->c_peer.nid),
+ ll_opcode2str(opc), err);
+
+ RETURN(err < 0 ? err : -EINVAL);
+ }
+
RETURN(err);
}
int rc = 0;
ENTRY;
+ spin_lock(&req->rq_lock);
+ req->rq_timedout = 1;
+ spin_unlock(&req->rq_lock);
+
DEBUG_REQ(D_WARNING, req, "Request x"LPU64" sent from %s to NID %s "
CFS_DURATION_T"s ago has %s (limit "CFS_DURATION_T"s).\n",
req->rq_xid, imp ? imp->imp_obd->obd_name : "<?>",
if (imp != NULL && obd_debug_peer_on_timeout)
LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
- spin_lock(&req->rq_lock);
- req->rq_timedout = 1;
- spin_unlock(&req->rq_lock);
-
ptlrpc_unregister_reply(req, async_unlink);
ptlrpc_unregister_bulk(req, async_unlink);
if (imp->imp_conn_current != imp_conn) {
if (imp->imp_conn_current)
- LCONSOLE_INFO("Changing connection for %s to %s/%s\n",
- imp->imp_obd->obd_name,
- imp_conn->oic_uuid.uuid,
- libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+ CDEBUG(D_HA, "Changing connection for %s to %s/%s\n",
+ imp->imp_obd->obd_name, imp_conn->oic_uuid.uuid,
+ libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
imp->imp_conn_current = imp_conn;
}