From 7c4ce177ed0ca1103f8944f635896f0dad405e44 Mon Sep 17 00:00:00 2001 From: anserper Date: Tue, 5 May 2009 13:52:56 +0000 Subject: [PATCH] b=18948 ; revert the commit --- lustre/include/obd.h | 1 - lustre/ldlm/ldlm_lib.c | 24 +++++++------------ lustre/mds/handler.c | 7 ++++-- lustre/mds/mds_lov.c | 2 +- lustre/obdclass/genops.c | 9 ++++--- lustre/obdfilter/filter.c | 11 +++++---- lustre/osc/osc_create.c | 3 +-- lustre/ptlrpc/client.c | 60 +++++++++++++---------------------------------- lustre/ptlrpc/import.c | 7 +++--- 9 files changed, 47 insertions(+), 77 deletions(-) diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 4dac00d..a8d10b4 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -895,7 +895,6 @@ struct obd_device { int obd_max_recoverable_clients; int obd_connected_clients; int obd_recoverable_clients; - int obd_stale_clients; spinlock_t obd_processing_task_lock; /* BH lock (timer) */ pid_t obd_processing_task; /* thread to handle recovery queue */ diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 6518523..0af549c 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1055,17 +1055,7 @@ static void target_release_saved_req(struct ptlrpc_request *req) static void target_finish_recovery(struct obd_device *obd) { - int max_clients = obd->obd_max_recoverable_clients; struct ptlrpc_request *req, *tmp; - time_t elapsed_time = max_t(time_t, 1, cfs_time_current_sec() - - obd->obd_recovery_start); - - LCONSOLE_INFO("%s: Recovery period over after %d:%.02d, of %d clients " - "%d recovered and %d %s evicted.\n", obd->obd_name, - (int)elapsed_time/60, (int)elapsed_time%60, max_clients, - max_clients - obd->obd_recoverable_clients, - obd->obd_stale_clients, - obd->obd_stale_clients == 1 ? "was" : "were"); ldlm_reprocess_all_ns(obd->obd_namespace); spin_lock_bh(&obd->obd_processing_task_lock); @@ -1081,9 +1071,8 @@ static void target_finish_recovery(struct obd_device *obd) /* when recovery finished, cleanup orphans on mds and ost */ if (OBT(obd) && OBP(obd, postrecov)) { int rc = OBP(obd, postrecov)(obd); - if (rc < 0) - LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", - obd->obd_name, rc); + LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name, + rc < 0 ? "failed" : "complete", rc); } LCONSOLE_INFO("%s: sending delayed replies to recovered clients\n", @@ -1189,6 +1178,11 @@ void target_abort_recovery(void *data) target_cancel_recovery_timer(obd); spin_unlock_bh(&obd->obd_processing_task_lock); + LCONSOLE_WARN("%s: recovery period over; %d clients never reconnected " + "after %lds (%d clients did)\n", + obd->obd_name, obd->obd_recoverable_clients, + cfs_time_current_sec()- obd->obd_recovery_start, + obd->obd_connected_clients); class_disconnect_stale_exports(obd, flags); abort_recovery_queue(obd); @@ -1200,7 +1194,7 @@ void target_abort_recovery(void *data) static void target_recovery_expired(unsigned long castmeharder) { struct obd_device *obd = (struct obd_device *)castmeharder; - CDEBUG(D_HA, "%s: recovery timed out, aborting\n", obd->obd_name); + CERROR("%s: recovery timed out, aborting\n", obd->obd_name); spin_lock_bh(&obd->obd_processing_task_lock); if (obd->obd_recovering) obd->obd_abort_recovery = 1; @@ -1265,7 +1259,7 @@ static void check_and_start_recovery_timer(struct obd_device *obd, spin_unlock_bh(&obd->obd_processing_task_lock); return; } - CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name); + CWARN("%s: starting recovery timer\n", obd->obd_name); obd->obd_recovery_start = cfs_time_current_sec(); /* minimum */ obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout; diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index fc1ef33..e436249 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -2157,14 +2157,17 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) if (obd->obd_recovering) { LCONSOLE_WARN("MDT %s now serving %s (%s%s%s), but will be in " "recovery for at least %d:%.02d, or until %d " - "client%s reconnect%s.\n", + "client%s reconnect. During this time new clients" + " will not be allowed to connect. " + "Recovery progress can be monitored by watching " + "/proc/fs/lustre/mds/%s/recovery_status.\n", obd->obd_name, lustre_cfg_string(lcfg, 1), label ?: "", label ? "/" : "", str, obd->obd_recovery_timeout / 60, obd->obd_recovery_timeout % 60, obd->obd_max_recoverable_clients, (obd->obd_max_recoverable_clients == 1) ? "":"s", - (obd->obd_max_recoverable_clients == 1) ? "s":""); + obd->obd_name); } else { LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery " "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1), diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 100be31..8118130 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -857,7 +857,7 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, } case OBD_IOC_ABORT_RECOVERY: - LCONSOLE_WARN("%s: Aborting recovery.\n", obd->obd_name); + CERROR("aborting recovery for device %s\n", obd->obd_name); target_abort_recovery(obd); /* obd_recovering has been changed */ mds_allow_cli(obd, 0); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 98aef23..bc12c28 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1044,24 +1044,23 @@ void class_disconnect_stale_exports(struct obd_device *obd, struct list_head work_list; struct list_head *pos, *n; struct obd_export *exp; + int cnt = 0; ENTRY; CFS_INIT_LIST_HEAD(&work_list); spin_lock(&obd->obd_dev_lock); - obd->obd_stale_clients = 0; list_for_each_safe(pos, n, &obd->obd_exports) { exp = list_entry(pos, struct obd_export, exp_obd_chain); if (exp->exp_replay_needed) { list_del(&exp->exp_obd_chain); list_add(&exp->exp_obd_chain, &work_list); - obd->obd_stale_clients++; + cnt++; } } spin_unlock(&obd->obd_dev_lock); - CDEBUG(D_HA, "%s: disconnecting %d stale clients\n", obd->obd_name, - obd->obd_stale_clients); - + CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n", + obd->obd_name, cnt); class_disconnect_export_list(&work_list, flags); EXIT; } diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index d9a347b..a74f29b 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1854,14 +1854,17 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, if (obd->obd_recovering) { LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in " "recovery for at least %d:%.02d, or until %d " - "client%s reconnect%s.\n", + "client%s reconnect. During this time new clients" + " will not be allowed to connect. " + "Recovery progress can be monitored by watching " + "/proc/fs/lustre/obdfilter/%s/recovery_status.\n", obd->obd_name, lustre_cfg_string(lcfg, 1), label ?: "", label ? "/" : "", str, obd->obd_recovery_timeout / 60, obd->obd_recovery_timeout % 60, obd->obd_max_recoverable_clients, - obd->obd_max_recoverable_clients == 1 ? "":"s", - obd->obd_max_recoverable_clients == 1 ? "s":""); + (obd->obd_max_recoverable_clients == 1) ? "":"s", + obd->obd_name); } else { LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery " "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1), @@ -2859,7 +2862,7 @@ static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa, last = filter_last_id(filter, doa.o_gr); skip_orphan = !!(exp->exp_connect_flags & OBD_CONNECT_SKIP_ORPHAN); - CDEBUG(D_HA, "%s: deleting orphan objects from "LPU64" to "LPU64"%s\n", + CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"%s\n", exp->exp_obd->obd_name, oa->o_id + 1, last, skip_orphan ? ", orphan objids won't be reused any more." : "."); diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index d1b0c8a..7489cc5 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -374,8 +374,7 @@ int osc_create(struct obd_export *exp, struct obdo *oa, oscc->oscc_last_id = oa->o_id; ocd = &imp->imp_connect_data; if (ocd->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) { - CDEBUG(D_HA, "%s: Skip orphan set, reset last " - "objid\n", oscc->oscc_obd->obd_name); + CWARN("Skip orphan set, reset last objid\n"); oscc->oscc_next_id = oa->o_id + 1; } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 683bf4c..42042cb 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -848,58 +848,30 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) return rc; } -/* Conditionally suppress specific console messages */ -static int ptlrpc_console_allow(struct ptlrpc_request *req) -{ - __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); - int err; - - /* Suppress particular reconnect errors which are to be expected. No - * errors are suppressed for the initial connection on an import */ - if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && - (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { - - /* Suppress timed out reconnect requests */ - if (req->rq_timedout) - return 0; - - /* Suppress unavailable/again reconnect requests */ - err = lustre_msg_get_status(req->rq_repmsg); - if (err == -ENODEV || err == -EAGAIN) - return 0; - } - - return 1; -} - static int ptlrpc_check_status(struct ptlrpc_request *req) { int err; ENTRY; err = lustre_msg_get_status(req->rq_repmsg); - if (err < 0) { - DEBUG_REQ(D_INFO, req, "status is %d", err); - } else if (err > 0) { - /* XXX: translate this error from net to host */ - DEBUG_REQ(D_INFO, req, "status is %d", err); - } - if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { struct obd_import *imp = req->rq_import; __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); - if (ptlrpc_console_allow(req)) - LCONSOLE_ERROR_MSG(0x011,"an error occurred while " - "communicating with %s. The %s " - "operation failed with %d\n", - libcfs_nid2str( - imp->imp_connection->c_peer.nid), - ll_opcode2str(opc), err); - + LCONSOLE_ERROR_MSG(0x011,"an error occurred while communicating" + " with %s. The %s operation failed with %d\n", + libcfs_nid2str(imp->imp_connection->c_peer.nid), + ll_opcode2str(opc), err); RETURN(err < 0 ? err : -EINVAL); } + if (err < 0) { + DEBUG_REQ(D_INFO, req, "status is %d", err); + } else if (err > 0) { + /* XXX: translate this error from net to host */ + DEBUG_REQ(D_INFO, req, "status is %d", err); + } + RETURN(err); } @@ -1340,11 +1312,7 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) req->rq_net_err ? "network error" : "timeout", (long)req->rq_sent, cfs_time_current_sec() - req->rq_sent); - spin_lock(&req->rq_lock); - req->rq_timedout = 1; - spin_unlock(&req->rq_lock); - - if (imp && ptlrpc_console_allow(req)) { + if (imp) { LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s %lus ago" " has timed out (limit %lus).\n", req->rq_xid, req->rq_import->imp_obd->obd_name, @@ -1356,6 +1324,10 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) if (imp != NULL && obd_debug_peer_on_timeout) LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer); + spin_lock(&req->rq_lock); + req->rq_timedout = 1; + spin_unlock(&req->rq_lock); + ptlrpc_unregister_reply(req, async_unlink); ptlrpc_unregister_bulk(req, async_unlink); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index fd00220..5e6d08a 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -512,9 +512,10 @@ static int import_select_connection(struct obd_import *imp) if (imp->imp_conn_current != imp_conn) { if (imp->imp_conn_current) - CDEBUG(D_HA, "Changing connection for %s to %s/%s\n", - imp->imp_obd->obd_name, imp_conn->oic_uuid.uuid, - libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + LCONSOLE_INFO("Changing connection for %s to %s/%s\n", + imp->imp_obd->obd_name, + imp_conn->oic_uuid.uuid, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); imp->imp_conn_current = imp_conn; } -- 1.8.3.1