From: Li Wei Date: Wed, 30 Jan 2013 12:38:59 +0000 (+0800) Subject: LU-2467 ptlrpc: Allow OBD_PINGs to be suppressed X-Git-Tag: 2.3.61~40 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=57267444aa67399b586b94073609c0ad9c4cb3b0 LU-2467 ptlrpc: Allow OBD_PINGs to be suppressed This patch introduces a new ptlrpc module parameter, "suppress_pings", to provide an option for reducing excessive OBD_PINGs in large clusters. The parameter affects all MDTs and OSTs on a node. It is off (zero) by default, giving behavior identical to the current implementation. If it is on (non-zero), all clients of the affected targets who understand OBD_CONNECT_PINGLESS will know, at connect time, that pings are not required. When suppressing pings, there must be an external mechanism to notify the targets of client deaths, via the targets' "evict_client" procfs entries. In addition, a highly available standalone MGS is also recommended when suppressing pings, so that clients are notified (through Imperative Recovery) of target recoveries. The changes do two basically independent things. One is initializing import and export states (i.e., imp_connect_data and exp_obd_chain_timed) according to "suppress_pings", since whether to ping or not is a property of each import-export pair. MGC pings cannot be suppressed, because maintaining MGS connections is dictated by the reliance on Imperative Recovery. The other thing is changing pinger and import routines to respect the import property set earlier. (The export side does not need any change at all.) Pings are still needed to query last committed transactions if there are uncommitted requests on an import, so that resources pinned for replays can be released even when applications become idle. An early version of this patch removed imports that do not need to be pinged from pinger_imports and added them back when last committed transactions are needed or recoveries must be initiated. 
This version does not do that because a) the overheads of iterating through 10,000 imports are not prohibitively large---around 10 ms, b) adding imports back to pinger_imports requires the global pinger_mutex to be held, and c) the imp_lock contention added on each import is small. Change-Id: Iabc84d395c978c3f156c52aebfad83621facb4fe Signed-off-by: Li Wei Reviewed-on: http://review.whamcloud.com/5009 Reviewed-by: Lai Siyao Tested-by: Hudson Reviewed-by: Emoly Liu Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 09594bc..92613ef 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -1219,7 +1219,8 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \ OBD_CONNECT_EINPROGRESS | \ OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \ - OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK) + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\ + OBD_CONNECT_PINGLESS) #define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ @@ -1235,11 +1236,12 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_EINPROGRESS | \ OBD_CONNECT_JOBSTATS | \ OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\ - OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID) + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ + OBD_CONNECT_PINGLESS) #define ECHO_CONNECT_SUPPORTED (0) #define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \ - OBD_CONNECT_MNE_SWAB) + OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS) /* Features required for this version of the client to work with server */ #define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \ diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 
cba4647..cb1c04b 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -261,6 +261,7 @@ struct obd_import { imp_no_lock_replay:1, /* VBR: if gap was found then no lock replays */ imp_vbr_failed:1, /* recovery by versions was failed */ imp_force_verify:1, /* force an immidiate ping */ + imp_force_next_verify:1,/* force a scheduled ping */ imp_pingable:1, /* pingable */ imp_resend_replay:1, /* resend for replay */ imp_no_pinger_recover:1,/* disable normal recovery, for test only. */ diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index e5e7885..74983d0 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -2956,6 +2956,7 @@ int server_disconnect_export(struct obd_export *exp); * Pinger API (client side only) * @{ */ +extern int suppress_pings; enum timeout_event { TIMEOUT_GRANT = 1 }; diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 0db5ccd..85f0fec 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -418,7 +418,6 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) GOTO(err_ldlm, rc = -ENOENT); imp->imp_client = &obddev->obd_ldlm_client; imp->imp_connect_op = connect_op; - CFS_INIT_LIST_HEAD(&imp->imp_pinger_chain); memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), LUSTRE_CFG_BUFLEN(lcfg, 1)); class_import_put(imp); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 501fb38..af98592 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -218,7 +218,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH| OBD_CONNECT_EINPROGRESS | OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_LAYOUTLOCK; + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS; if (sbi->ll_flags & LL_SBI_SOM_PREVIEW) data->ocd_connect_flags |= OBD_CONNECT_SOM; @@ -402,7 +402,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, 
char *dt, OBD_CONNECT_MAXBYTES | OBD_CONNECT_EINPROGRESS | OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_LAYOUTLOCK; + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS; if (sbi->ll_flags & LL_SBI_SOM_PREVIEW) data->ocd_connect_flags |= OBD_CONNECT_SOM; diff --git a/lustre/lod/lod_lov.c b/lustre/lod/lod_lov.c index b7460b4..1d6986b 100644 --- a/lustre/lod/lod_lov.c +++ b/lustre/lod/lod_lov.c @@ -227,7 +227,8 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod, OBD_CONNECT_SKIP_ORPHAN | OBD_CONNECT_FID | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_VERSION; + OBD_CONNECT_VERSION | + OBD_CONNECT_PINGLESS; data->ocd_group = tgt_index; ltd = &lod->lod_ost_descs; @@ -242,7 +243,8 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod, OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | OBD_CONNECT_AT | - OBD_CONNECT_FULL20; + OBD_CONNECT_FULL20 | + OBD_CONNECT_PINGLESS; /* XXX set MDS-MDS flags, remove this when running this * on client*/ data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS; diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index bef495f..87c2fc7 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -5066,6 +5066,16 @@ static int mdt_connect_internal(struct obd_export *exp, return -EBADE; } + if (data->ocd_connect_flags & OBD_CONNECT_PINGLESS) { + if (suppress_pings) { + spin_lock(&exp->exp_obd->obd_dev_lock); + list_del_init(&exp->exp_obd_chain_timed); + spin_unlock(&exp->exp_obd->obd_dev_lock); + } else { + data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; + } + } + return 0; } diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 7c06088..cd1443c 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1037,6 +1037,7 @@ struct obd_import *class_new_import(struct obd_device *obd) if (imp == NULL) return NULL; + CFS_INIT_LIST_HEAD(&imp->imp_pinger_chain); CFS_INIT_LIST_HEAD(&imp->imp_zombie_chain); CFS_INIT_LIST_HEAD(&imp->imp_replay_list); 
CFS_INIT_LIST_HEAD(&imp->imp_sending_list); diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index ccf5a6e..8a2c429 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -1056,7 +1056,8 @@ static int lustre_lwp_connect(struct obd_device *lwp) data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | OBD_CONNECT_AT | OBD_CONNECT_LRU_RESIZE | OBD_CONNECT_FULL20 | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_LIGHTWEIGHT; + OBD_CONNECT_LIGHTWEIGHT | + OBD_CONNECT_PINGLESS; OBD_ALLOC_PTR(uuid); if (uuid == NULL) GOTO(out, rc = -ENOMEM); diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c index 1f4c108..be76426 100644 --- a/lustre/ofd/ofd_obd.c +++ b/lustre/ofd/ofd_obd.c @@ -237,7 +237,17 @@ static int ofd_parse_connect_data(const struct lu_env *env, if (data->ocd_connect_flags & OBD_CONNECT_MAXBYTES) data->ocd_maxbytes = ofd->ofd_dt_conf.ddp_maxbytes; - RETURN(0); + if (data->ocd_connect_flags & OBD_CONNECT_PINGLESS) { + if (suppress_pings) { + spin_lock(&exp->exp_obd->obd_dev_lock); + list_del_init(&exp->exp_obd_chain_timed); + spin_unlock(&exp->exp_obd->obd_dev_lock); + } else { + data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; + } + } + + RETURN(0); } static int ofd_obd_reconnect(const struct lu_env *env, struct obd_export *exp, diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 54208b8..819276c 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1334,10 +1334,22 @@ static int after_reply(struct ptlrpc_request *req) imp->imp_peer_committed_transno = lustre_msg_get_last_committed(req->rq_repmsg); } - ptlrpc_free_committed(imp); - if (req->rq_transno > imp->imp_peer_committed_transno) - ptlrpc_pinger_commit_expected(imp); + ptlrpc_free_committed(imp); + + if (!cfs_list_empty(&imp->imp_replay_list)) { + struct ptlrpc_request *last; + + last = cfs_list_entry(imp->imp_replay_list.prev, + struct ptlrpc_request, + rq_replay_list); + /* + * Requests with rq_replay stay on the list even if 
no + * commit is expected. + */ + if (last->rq_transno > imp->imp_peer_committed_transno) + ptlrpc_pinger_commit_expected(imp); + } spin_unlock(&imp->imp_lock); } diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index 61f66b9..f5bf639 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -48,9 +48,14 @@ #include #include "ptlrpc_internal.h" +int suppress_pings; +EXPORT_SYMBOL(suppress_pings); +CFS_MODULE_PARM(suppress_pings, "i", int, 0644, "Suppress pings"); + struct mutex pinger_mutex; static CFS_LIST_HEAD(pinger_imports); static cfs_list_t timeout_list = CFS_LIST_HEAD_INIT(timeout_list); + struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp) { @@ -225,23 +230,39 @@ int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req) static void ptlrpc_pinger_process_import(struct obd_import *imp, unsigned long this_ping) { - int force, level; + int level; + int force; + int force_next; + int suppress; spin_lock(&imp->imp_lock); + level = imp->imp_state; force = imp->imp_force_verify; - if (force) - imp->imp_force_verify = 0; - spin_unlock(&imp->imp_lock); + force_next = imp->imp_force_next_verify; + /* + * This will be used below only if the import is "FULL". + */ + suppress = !!(imp->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_PINGLESS); - CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, - "level %s/%u force %u deactive %u pingable %u\n", - ptlrpc_import_state_name(level), level, - force, imp->imp_deactive, imp->imp_pingable); + imp->imp_force_verify = 0; - if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, - this_ping) && force == 0) - return; + if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) && + !force) { + spin_unlock(&imp->imp_lock); + return; + } + + imp->imp_force_next_verify = 0; + + spin_unlock(&imp->imp_lock); + + CDEBUG(level == LUSTRE_IMP_FULL ? 
D_INFO : D_HA, "%s->%s: level %s/%u " + "force %u force_next %u deactive %u pingable %u suppress %u\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level), level, force, force_next, + imp->imp_deactive, imp->imp_pingable, suppress); if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { /* wait for a while before trying recovery again */ @@ -251,13 +272,13 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp, } else if (level != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov || imp_is_deactive(imp)) { - CDEBUG(D_HA, "not pinging %s (in recovery " - " or recovery disabled: %s)\n", - obd2cli_tgt(imp->imp_obd), - ptlrpc_import_state_name(level)); - } else if (imp->imp_pingable || force) { - ptlrpc_ping(imp); - } + CDEBUG(D_HA, "%s->%s: not pinging (in recovery " + "or recovery disabled: %s)\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level)); + } else if ((imp->imp_pingable && !suppress) || force_next || force) { + ptlrpc_ping(imp); + } } static int ptlrpc_pinger_main(void *arg) @@ -372,7 +393,14 @@ int ptlrpc_start_pinger(void) l_wait_event(pinger_thread->t_ctl_waitq, thread_is_running(pinger_thread), &lwi); - RETURN(0); + if (suppress_pings) + CWARN("Pings will be suppressed at the request of the " + "administrator. The configuration shall meet the " + "additional requirements described in the manual. " + "(Search for the \"suppress_pings\" kernel module " + "parameter.)\n"); + + RETURN(0); } int ptlrpc_pinger_remove_timeouts(void); @@ -411,7 +439,17 @@ EXPORT_SYMBOL(ptlrpc_pinger_sending_on_import); void ptlrpc_pinger_commit_expected(struct obd_import *imp) { - ptlrpc_update_next_ping(imp, 1); + ptlrpc_update_next_ping(imp, 1); + LASSERT_SPIN_LOCKED(&imp->imp_lock); + /* + * Avoid reading stale imp_connect_data. 
When not sure if pings are + * expected or not on next connection, we assume they are not and force + * one anyway to guarantee the chance of updating + * imp_peer_committed_transno. + */ + if (imp->imp_state != LUSTRE_IMP_FULL || + imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_PINGLESS) + imp->imp_force_next_verify = 1; } int ptlrpc_pinger_add_import(struct obd_import *imp)