int obd_max_recoverable_clients;
int obd_connected_clients;
int obd_recoverable_clients;
- int obd_stale_clients;
spinlock_t obd_processing_task_lock; /* BH lock (timer) */
pid_t obd_processing_task;
/* thread to handle recovery queue */
static void target_finish_recovery(struct obd_device *obd)
{
- int max_clients = obd->obd_max_recoverable_clients;
struct ptlrpc_request *req, *tmp;
- time_t elapsed_time = max_t(time_t, 1, cfs_time_current_sec() -
- obd->obd_recovery_start);
-
- LCONSOLE_INFO("%s: Recovery period over after %d:%.02d, of %d clients "
- "%d recovered and %d %s evicted.\n", obd->obd_name,
- (int)elapsed_time/60, (int)elapsed_time%60, max_clients,
- max_clients - obd->obd_recoverable_clients,
- obd->obd_stale_clients,
- obd->obd_stale_clients == 1 ? "was" : "were");
ldlm_reprocess_all_ns(obd->obd_namespace);
spin_lock_bh(&obd->obd_processing_task_lock);
/* when recovery finished, cleanup orphans on mds and ost */
if (OBT(obd) && OBP(obd, postrecov)) {
int rc = OBP(obd, postrecov)(obd);
- if (rc < 0)
- LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
- obd->obd_name, rc);
+ LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name,
+ rc < 0 ? "failed" : "complete", rc);
}
LCONSOLE_INFO("%s: sending delayed replies to recovered clients\n",
target_cancel_recovery_timer(obd);
spin_unlock_bh(&obd->obd_processing_task_lock);
+ LCONSOLE_WARN("%s: recovery period over; %d clients never reconnected "
+ "after %lds (%d clients did)\n",
+ obd->obd_name, obd->obd_recoverable_clients,
+ cfs_time_current_sec()- obd->obd_recovery_start,
+ obd->obd_connected_clients);
class_disconnect_stale_exports(obd, flags);
abort_recovery_queue(obd);
static void target_recovery_expired(unsigned long castmeharder)
{
struct obd_device *obd = (struct obd_device *)castmeharder;
- CDEBUG(D_HA, "%s: recovery timed out, aborting\n", obd->obd_name);
+ CERROR("%s: recovery timed out, aborting\n", obd->obd_name);
spin_lock_bh(&obd->obd_processing_task_lock);
if (obd->obd_recovering)
obd->obd_abort_recovery = 1;
spin_unlock_bh(&obd->obd_processing_task_lock);
return;
}
- CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
+ CWARN("%s: starting recovery timer\n", obd->obd_name);
obd->obd_recovery_start = cfs_time_current_sec();
/* minimum */
obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
if (obd->obd_recovering) {
LCONSOLE_WARN("MDT %s now serving %s (%s%s%s), but will be in "
"recovery for at least %d:%.02d, or until %d "
- "client%s reconnect%s.\n",
+ "client%s reconnect. During this time new clients"
+ " will not be allowed to connect. "
+ "Recovery progress can be monitored by watching "
+ "/proc/fs/lustre/mds/%s/recovery_status.\n",
obd->obd_name, lustre_cfg_string(lcfg, 1),
label ?: "", label ? "/" : "", str,
obd->obd_recovery_timeout / 60,
obd->obd_recovery_timeout % 60,
obd->obd_max_recoverable_clients,
(obd->obd_max_recoverable_clients == 1) ? "":"s",
- (obd->obd_max_recoverable_clients == 1) ? "s":"");
+ obd->obd_name);
} else {
LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery "
"%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
}
case OBD_IOC_ABORT_RECOVERY:
- LCONSOLE_WARN("%s: Aborting recovery.\n", obd->obd_name);
+ CERROR("aborting recovery for device %s\n", obd->obd_name);
target_abort_recovery(obd);
/* obd_recovering has been changed */
mds_allow_cli(obd, 0);
struct list_head work_list;
struct list_head *pos, *n;
struct obd_export *exp;
+ int cnt = 0;
ENTRY;
CFS_INIT_LIST_HEAD(&work_list);
spin_lock(&obd->obd_dev_lock);
- obd->obd_stale_clients = 0;
list_for_each_safe(pos, n, &obd->obd_exports) {
exp = list_entry(pos, struct obd_export, exp_obd_chain);
if (exp->exp_replay_needed) {
list_del(&exp->exp_obd_chain);
list_add(&exp->exp_obd_chain, &work_list);
- obd->obd_stale_clients++;
+ cnt++;
}
}
spin_unlock(&obd->obd_dev_lock);
- CDEBUG(D_HA, "%s: disconnecting %d stale clients\n", obd->obd_name,
- obd->obd_stale_clients);
-
+ CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n",
+ obd->obd_name, cnt);
class_disconnect_export_list(&work_list, flags);
EXIT;
}
if (obd->obd_recovering) {
LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in "
"recovery for at least %d:%.02d, or until %d "
- "client%s reconnect%s.\n",
+ "client%s reconnect. During this time new clients"
+ " will not be allowed to connect. "
+ "Recovery progress can be monitored by watching "
+ "/proc/fs/lustre/obdfilter/%s/recovery_status.\n",
obd->obd_name, lustre_cfg_string(lcfg, 1),
label ?: "", label ? "/" : "", str,
obd->obd_recovery_timeout / 60,
obd->obd_recovery_timeout % 60,
obd->obd_max_recoverable_clients,
- obd->obd_max_recoverable_clients == 1 ? "":"s",
- obd->obd_max_recoverable_clients == 1 ? "s":"");
+ (obd->obd_max_recoverable_clients == 1) ? "":"s",
+ obd->obd_name);
} else {
LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
"%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
last = filter_last_id(filter, doa.o_gr);
skip_orphan = !!(exp->exp_connect_flags & OBD_CONNECT_SKIP_ORPHAN);
- CDEBUG(D_HA, "%s: deleting orphan objects from "LPU64" to "LPU64"%s\n",
+ CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"%s\n",
exp->exp_obd->obd_name, oa->o_id + 1, last,
skip_orphan ? ", orphan objids won't be reused any more." : ".");
oscc->oscc_last_id = oa->o_id;
ocd = &imp->imp_connect_data;
if (ocd->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) {
- CDEBUG(D_HA, "%s: Skip orphan set, reset last "
- "objid\n", oscc->oscc_obd->obd_name);
+ CWARN("Skip orphan set, reset last objid\n");
oscc->oscc_next_id = oa->o_id + 1;
}
return rc;
}
/*
 * NOTE(review): every line of ptlrpc_console_allow() below carries the '-'
 * marker, i.e. this patch deletes the helper; the two call sites visible in
 * this patch are '-' lines as well.
 *
 * As written, the helper returns 0 (suppress the console message) only when
 * lustre_handle_is_used() is true for the import's remote handle AND the
 * request opcode is OST_CONNECT, MDS_CONNECT or MGS_CONNECT AND either the
 * request is flagged rq_timedout or the reply status is -ENODEV / -EAGAIN.
 * In every other case it returns 1 (allow the message).
 */
-/* Conditionally suppress specific console messages */
-static int ptlrpc_console_allow(struct ptlrpc_request *req)
-{
- __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
- int err;
-
- /* Suppress particular reconnect errors which are to be expected. No
- * errors are suppressed for the initial connection on an import */
- if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
- (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {
-
- /* Suppress timed out reconnect requests */
- if (req->rq_timedout)
- return 0;
-
/*
 * NOTE(review): the rq_timedout test above runs before rq_repmsg is read;
 * presumably a timed-out request may carry no usable reply -- confirm before
 * reordering these checks.
 */
- /* Suppress unavailable/again reconnect requests */
- err = lustre_msg_get_status(req->rq_repmsg);
- if (err == -ENODEV || err == -EAGAIN)
- return 0;
- }
-
- return 1;
-}
-
/*
 * Inspect the status carried in the reply message of req.
 *
 * Returns the reply status unchanged, except when the reply message type is
 * PTL_RPC_MSG_ERR: in that case an error is printed on the console and the
 * return value is the status if it is negative, -EINVAL otherwise.
 *
 * NOTE(review): read the '-'/'+' markers as the patch itself. The change
 * (a) drops the ptlrpc_console_allow() filter, so the console error is
 * printed for every PTL_RPC_MSG_ERR reply, and (b) moves the D_INFO status
 * logging below the PTL_RPC_MSG_ERR branch, so it is no longer reached when
 * that branch returns.
 */
static int ptlrpc_check_status(struct ptlrpc_request *req)
{
int err;
ENTRY;
err = lustre_msg_get_status(req->rq_repmsg);
- if (err < 0) {
- DEBUG_REQ(D_INFO, req, "status is %d", err);
- } else if (err > 0) {
- /* XXX: translate this error from net to host */
- DEBUG_REQ(D_INFO, req, "status is %d", err);
- }
-
/* Error-typed reply: report peer NID, operation name and status on the console. */
if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
struct obd_import *imp = req->rq_import;
__u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
- if (ptlrpc_console_allow(req))
- LCONSOLE_ERROR_MSG(0x011,"an error occurred while "
- "communicating with %s. The %s "
- "operation failed with %d\n",
- libcfs_nid2str(
- imp->imp_connection->c_peer.nid),
- ll_opcode2str(opc), err);
-
+ LCONSOLE_ERROR_MSG(0x011,"an error occurred while communicating"
+ " with %s. The %s operation failed with %d\n",
+ libcfs_nid2str(imp->imp_connection->c_peer.nid),
+ ll_opcode2str(opc), err);
/* A non-negative status on an error-typed reply is mapped to -EINVAL. */
RETURN(err < 0 ? err : -EINVAL);
}
/*
 * Both branches below log the same message; the err > 0 case is kept
 * separate only to carry the XXX note about net-to-host translation.
 */
+ if (err < 0) {
+ DEBUG_REQ(D_INFO, req, "status is %d", err);
+ } else if (err > 0) {
+ /* XXX: translate this error from net to host */
+ DEBUG_REQ(D_INFO, req, "status is %d", err);
+ }
+
RETURN(err);
}
req->rq_net_err ? "network error" : "timeout",
(long)req->rq_sent, cfs_time_current_sec() - req->rq_sent);
- spin_lock(&req->rq_lock);
- req->rq_timedout = 1;
- spin_unlock(&req->rq_lock);
-
- if (imp && ptlrpc_console_allow(req)) {
+ if (imp) {
LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s %lus ago"
" has timed out (limit %lus).\n", req->rq_xid,
req->rq_import->imp_obd->obd_name,
if (imp != NULL && obd_debug_peer_on_timeout)
LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
+ spin_lock(&req->rq_lock);
+ req->rq_timedout = 1;
+ spin_unlock(&req->rq_lock);
+
ptlrpc_unregister_reply(req, async_unlink);
ptlrpc_unregister_bulk(req, async_unlink);
if (imp->imp_conn_current != imp_conn) {
if (imp->imp_conn_current)
- CDEBUG(D_HA, "Changing connection for %s to %s/%s\n",
- imp->imp_obd->obd_name, imp_conn->oic_uuid.uuid,
- libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+ LCONSOLE_INFO("Changing connection for %s to %s/%s\n",
+ imp->imp_obd->obd_name,
+ imp_conn->oic_uuid.uuid,
+ libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
imp->imp_conn_current = imp_conn;
}