This patch introduces a new ptlrpc module parameter, "suppress_pings",
to provide an option for reducing excessive OBD_PINGs in large
clusters. The parameter affects all MDTs and OSTs on a node. It is
off (zero) by default, giving behavior identical to the current
implementation. If it is on (non-zero), all clients of the affected
targets who understand OBD_CONNECT_PINGLESS will know, at connect
time, that pings are not required. When suppressing pings, there must
be an external mechanism to notify the targets of client deaths, via
the targets' "evict_client" procfs entries. In addition, a highly
available standalone MGS is also recommended when suppressing pings,
so that clients are notified (through Imperative Recovery) of target
recoveries.
The changes do two basically independent things. One is initializing
import and export states (i.e., imp_connect_data and
exp_obd_chain_timed) according to "suppress_pings", since whether to
ping or not is a property of each import-export pair. MGC pings cannot
be suppressed, because maintaining MGS connections is dictated by
the reliance on Imperative Recovery.
The other thing is changing pinger and import routines to respect the
import property set earlier. (The export side does not need any
change at all.) Pings are still needed to query last committed
transactions if there are uncommitted requests on an import, so that
resources pinned for replays can be released even when applications
become idle. An early version of this patch removed imports that do
not need to be pinged from pinger_imports and added them back when last
committed transactions are needed or recoveries must be initiated.
This version does not do that because a) the overhead of iterating
through 10,000 imports is not prohibitively large (around 10 ms), b)
adding imports back to pinger_imports requires the global pinger_mutex
to be held, and c) the imp_lock contention added on each import is
small.
Change-Id: Iabc84d395c978c3f156c52aebfad83621facb4fe
Signed-off-by: Li Wei <wei.g.li@intel.com>
Reviewed-on: http://review.whamcloud.com/5009
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Tested-by: Hudson
Reviewed-by: Emoly Liu <emoly.liu@intel.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
12 files changed:
OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \
OBD_CONNECT_EINPROGRESS | \
OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \
OBD_CONNECT_EINPROGRESS | \
OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
- OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK)
+ OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\
+ OBD_CONNECT_PINGLESS)
#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
OBD_CONNECT_EINPROGRESS | \
OBD_CONNECT_JOBSTATS | \
OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\
OBD_CONNECT_EINPROGRESS | \
OBD_CONNECT_JOBSTATS | \
OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\
- OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID)
+ OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \
+ OBD_CONNECT_PINGLESS)
#define ECHO_CONNECT_SUPPORTED (0)
#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \
OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \
#define ECHO_CONNECT_SUPPORTED (0)
#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \
OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \
+ OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS)
/* Features required for this version of the client to work with server */
#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \
/* Features required for this version of the client to work with server */
#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \
imp_no_lock_replay:1, /* VBR: if gap was found then no lock replays */
imp_vbr_failed:1, /* recovery by versions was failed */
imp_force_verify:1, /* force an immidiate ping */
imp_no_lock_replay:1, /* VBR: if gap was found then no lock replays */
imp_vbr_failed:1, /* recovery by versions was failed */
imp_force_verify:1, /* force an immidiate ping */
+ imp_force_next_verify:1,/* force a scheduled ping */
imp_pingable:1, /* pingable */
imp_resend_replay:1, /* resend for replay */
imp_no_pinger_recover:1,/* disable normal recovery, for test only. */
imp_pingable:1, /* pingable */
imp_resend_replay:1, /* resend for replay */
imp_no_pinger_recover:1,/* disable normal recovery, for test only. */
* Pinger API (client side only)
* @{
*/
* Pinger API (client side only)
* @{
*/
+extern int suppress_pings;
enum timeout_event {
TIMEOUT_GRANT = 1
};
enum timeout_event {
TIMEOUT_GRANT = 1
};
GOTO(err_ldlm, rc = -ENOENT);
imp->imp_client = &obddev->obd_ldlm_client;
imp->imp_connect_op = connect_op;
GOTO(err_ldlm, rc = -ENOENT);
imp->imp_client = &obddev->obd_ldlm_client;
imp->imp_connect_op = connect_op;
- CFS_INIT_LIST_HEAD(&imp->imp_pinger_chain);
memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
LUSTRE_CFG_BUFLEN(lcfg, 1));
class_import_put(imp);
memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
LUSTRE_CFG_BUFLEN(lcfg, 1));
class_import_put(imp);
OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH|
OBD_CONNECT_EINPROGRESS |
OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH|
OBD_CONNECT_EINPROGRESS |
OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
- OBD_CONNECT_LAYOUTLOCK;
+ OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
data->ocd_connect_flags |= OBD_CONNECT_SOM;
if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
data->ocd_connect_flags |= OBD_CONNECT_SOM;
OBD_CONNECT_MAXBYTES |
OBD_CONNECT_EINPROGRESS |
OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
OBD_CONNECT_MAXBYTES |
OBD_CONNECT_EINPROGRESS |
OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
- OBD_CONNECT_LAYOUTLOCK;
+ OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
data->ocd_connect_flags |= OBD_CONNECT_SOM;
if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
data->ocd_connect_flags |= OBD_CONNECT_SOM;
OBD_CONNECT_SKIP_ORPHAN |
OBD_CONNECT_FID |
OBD_CONNECT_LVB_TYPE |
OBD_CONNECT_SKIP_ORPHAN |
OBD_CONNECT_FID |
OBD_CONNECT_LVB_TYPE |
+ OBD_CONNECT_VERSION |
+ OBD_CONNECT_PINGLESS;
data->ocd_group = tgt_index;
ltd = &lod->lod_ost_descs;
data->ocd_group = tgt_index;
ltd = &lod->lod_ost_descs;
OBD_CONNECT_MDS_MDS |
OBD_CONNECT_FID |
OBD_CONNECT_AT |
OBD_CONNECT_MDS_MDS |
OBD_CONNECT_FID |
OBD_CONNECT_AT |
+ OBD_CONNECT_FULL20 |
+ OBD_CONNECT_PINGLESS;
/* XXX set MDS-MDS flags, remove this when running this
* on client*/
data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS;
/* XXX set MDS-MDS flags, remove this when running this
* on client*/
data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS;
+ if (data->ocd_connect_flags & OBD_CONNECT_PINGLESS) {
+ if (suppress_pings) {
+ spin_lock(&exp->exp_obd->obd_dev_lock);
+ list_del_init(&exp->exp_obd_chain_timed);
+ spin_unlock(&exp->exp_obd->obd_dev_lock);
+ } else {
+ data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
+ }
+ }
+
if (imp == NULL)
return NULL;
if (imp == NULL)
return NULL;
+ CFS_INIT_LIST_HEAD(&imp->imp_pinger_chain);
CFS_INIT_LIST_HEAD(&imp->imp_zombie_chain);
CFS_INIT_LIST_HEAD(&imp->imp_replay_list);
CFS_INIT_LIST_HEAD(&imp->imp_sending_list);
CFS_INIT_LIST_HEAD(&imp->imp_zombie_chain);
CFS_INIT_LIST_HEAD(&imp->imp_replay_list);
CFS_INIT_LIST_HEAD(&imp->imp_sending_list);
data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID |
OBD_CONNECT_AT | OBD_CONNECT_LRU_RESIZE |
OBD_CONNECT_FULL20 | OBD_CONNECT_LVB_TYPE |
data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID |
OBD_CONNECT_AT | OBD_CONNECT_LRU_RESIZE |
OBD_CONNECT_FULL20 | OBD_CONNECT_LVB_TYPE |
- OBD_CONNECT_LIGHTWEIGHT;
+ OBD_CONNECT_LIGHTWEIGHT |
+ OBD_CONNECT_PINGLESS;
OBD_ALLOC_PTR(uuid);
if (uuid == NULL)
GOTO(out, rc = -ENOMEM);
OBD_ALLOC_PTR(uuid);
if (uuid == NULL)
GOTO(out, rc = -ENOMEM);
if (data->ocd_connect_flags & OBD_CONNECT_MAXBYTES)
data->ocd_maxbytes = ofd->ofd_dt_conf.ddp_maxbytes;
if (data->ocd_connect_flags & OBD_CONNECT_MAXBYTES)
data->ocd_maxbytes = ofd->ofd_dt_conf.ddp_maxbytes;
+ if (data->ocd_connect_flags & OBD_CONNECT_PINGLESS) {
+ if (suppress_pings) {
+ spin_lock(&exp->exp_obd->obd_dev_lock);
+ list_del_init(&exp->exp_obd_chain_timed);
+ spin_unlock(&exp->exp_obd->obd_dev_lock);
+ } else {
+ data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
+ }
+ }
+
+ RETURN(0);
}
static int ofd_obd_reconnect(const struct lu_env *env, struct obd_export *exp,
}
static int ofd_obd_reconnect(const struct lu_env *env, struct obd_export *exp,
imp->imp_peer_committed_transno =
lustre_msg_get_last_committed(req->rq_repmsg);
}
imp->imp_peer_committed_transno =
lustre_msg_get_last_committed(req->rq_repmsg);
}
- ptlrpc_free_committed(imp);
- if (req->rq_transno > imp->imp_peer_committed_transno)
- ptlrpc_pinger_commit_expected(imp);
+ ptlrpc_free_committed(imp);
+
+ if (!cfs_list_empty(&imp->imp_replay_list)) {
+ struct ptlrpc_request *last;
+
+ last = cfs_list_entry(imp->imp_replay_list.prev,
+ struct ptlrpc_request,
+ rq_replay_list);
+ /*
+ * Requests with rq_replay stay on the list even if no
+ * commit is expected.
+ */
+ if (last->rq_transno > imp->imp_peer_committed_transno)
+ ptlrpc_pinger_commit_expected(imp);
+ }
spin_unlock(&imp->imp_lock);
}
spin_unlock(&imp->imp_lock);
}
#include <obd_class.h>
#include "ptlrpc_internal.h"
#include <obd_class.h>
#include "ptlrpc_internal.h"
+int suppress_pings;
+EXPORT_SYMBOL(suppress_pings);
+CFS_MODULE_PARM(suppress_pings, "i", int, 0644, "Suppress pings");
+
struct mutex pinger_mutex;
static CFS_LIST_HEAD(pinger_imports);
static cfs_list_t timeout_list = CFS_LIST_HEAD_INIT(timeout_list);
struct mutex pinger_mutex;
static CFS_LIST_HEAD(pinger_imports);
static cfs_list_t timeout_list = CFS_LIST_HEAD_INIT(timeout_list);
struct ptlrpc_request *
ptlrpc_prep_ping(struct obd_import *imp)
{
struct ptlrpc_request *
ptlrpc_prep_ping(struct obd_import *imp)
{
static void ptlrpc_pinger_process_import(struct obd_import *imp,
unsigned long this_ping)
{
static void ptlrpc_pinger_process_import(struct obd_import *imp,
unsigned long this_ping)
{
+ int level;
+ int force;
+ int force_next;
+ int suppress;
spin_lock(&imp->imp_lock);
spin_lock(&imp->imp_lock);
level = imp->imp_state;
force = imp->imp_force_verify;
level = imp->imp_state;
force = imp->imp_force_verify;
- if (force)
- imp->imp_force_verify = 0;
- spin_unlock(&imp->imp_lock);
+ force_next = imp->imp_force_next_verify;
+ /*
+ * This will be used below only if the import is "FULL".
+ */
+ suppress = !!(imp->imp_connect_data.ocd_connect_flags &
+ OBD_CONNECT_PINGLESS);
- CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA,
- "level %s/%u force %u deactive %u pingable %u\n",
- ptlrpc_import_state_name(level), level,
- force, imp->imp_deactive, imp->imp_pingable);
+ imp->imp_force_verify = 0;
- if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK,
- this_ping) && force == 0)
- return;
+ if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) &&
+ !force) {
+ spin_unlock(&imp->imp_lock);
+ return;
+ }
+
+ imp->imp_force_next_verify = 0;
+
+ spin_unlock(&imp->imp_lock);
+
+ CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, "%s->%s: level %s/%u "
+ "force %u force_next %u deactive %u pingable %u suppress %u\n",
+ imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+ ptlrpc_import_state_name(level), level, force, force_next,
+ imp->imp_deactive, imp->imp_pingable, suppress);
if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
/* wait for a while before trying recovery again */
if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
/* wait for a while before trying recovery again */
} else if (level != LUSTRE_IMP_FULL ||
imp->imp_obd->obd_no_recov ||
imp_is_deactive(imp)) {
} else if (level != LUSTRE_IMP_FULL ||
imp->imp_obd->obd_no_recov ||
imp_is_deactive(imp)) {
- CDEBUG(D_HA, "not pinging %s (in recovery "
- " or recovery disabled: %s)\n",
- obd2cli_tgt(imp->imp_obd),
- ptlrpc_import_state_name(level));
- } else if (imp->imp_pingable || force) {
- ptlrpc_ping(imp);
- }
+ CDEBUG(D_HA, "%s->%s: not pinging (in recovery "
+ "or recovery disabled: %s)\n",
+ imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+ ptlrpc_import_state_name(level));
+ } else if ((imp->imp_pingable && !suppress) || force_next || force) {
+ ptlrpc_ping(imp);
+ }
}
static int ptlrpc_pinger_main(void *arg)
}
static int ptlrpc_pinger_main(void *arg)
l_wait_event(pinger_thread->t_ctl_waitq,
thread_is_running(pinger_thread), &lwi);
l_wait_event(pinger_thread->t_ctl_waitq,
thread_is_running(pinger_thread), &lwi);
+ if (suppress_pings)
+ CWARN("Pings will be suppressed at the request of the "
+ "administrator. The configuration shall meet the "
+ "additional requirements described in the manual. "
+ "(Search for the \"suppress_pings\" kernel module "
+ "parameter.)\n");
+
+ RETURN(0);
}
int ptlrpc_pinger_remove_timeouts(void);
}
int ptlrpc_pinger_remove_timeouts(void);
void ptlrpc_pinger_commit_expected(struct obd_import *imp)
{
void ptlrpc_pinger_commit_expected(struct obd_import *imp)
{
- ptlrpc_update_next_ping(imp, 1);
+ ptlrpc_update_next_ping(imp, 1);
+ LASSERT_SPIN_LOCKED(&imp->imp_lock);
+ /*
+ * Avoid reading stale imp_connect_data. When not sure if pings are
+ * expected or not on next connection, we assume they are not and force
+ * one anyway to guarantee the chance of updating
+ * imp_peer_committed_transno.
+ */
+ if (imp->imp_state != LUSTRE_IMP_FULL ||
+ imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_PINGLESS)
+ imp->imp_force_next_verify = 1;
}
int ptlrpc_pinger_add_import(struct obd_import *imp)
}
int ptlrpc_pinger_add_import(struct obd_import *imp)