From 05a4be7c2905159e959e781b1af34d4d8c214700 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Fri, 12 Mar 2021 12:00:37 +0300 Subject: [PATCH] LU-14516 mgc: configurable wait-to-reprocess time so we can set it shorter, for testing purposes at least. to change minimal wait time MGC module option 'mgc_requeue_timeout_min' should be used (in seconds). additionally a random value up to mgc_requeue_timeout_min is added to avoid a flood of config re-read requests from clients. if mgc_requeue_timeout_min is set to 0, then random part will be up to 1 second. ost-pools: before: 5840s, after: 3474s sanity-flr: before: 1575s, after: 1381s sanity-quota: before: 10679s, after: 9703s Lustre-change: https://review.whamcloud.com/42020 Lustre-commit: 04b2da6180d3c8eda21f7ab36c676462be041b74 Signed-off-by: Alex Zhuravlev Change-Id: Iff7dad4ba14d687b7e891a1c346397e4c370800d Reviewed-by: Andreas Dilger Reviewed-by: Aurelien Degremont Reviewed-by: Sebastien Buisson Reviewed-by: James Simmons Reviewed-on: https://review.whamcloud.com/44788 Tested-by: jenkins Tested-by: Maloo --- lustre/mgc/mgc_internal.h | 8 +++++++ lustre/mgc/mgc_request.c | 50 ++++++++++++++++++++++++++++++++---------- lustre/tests/test-framework.sh | 4 ++++ lustre/utils/obd.c | 8 +++---- 4 files changed, 54 insertions(+), 16 deletions(-) diff --git a/lustre/mgc/mgc_internal.h b/lustre/mgc/mgc_internal.h index 50a13eb..1b9b4e3 100644 --- a/lustre/mgc/mgc_internal.h +++ b/lustre/mgc/mgc_internal.h @@ -44,6 +44,14 @@ int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); +/* this timeout represents how many seconds MGC should wait before + * requeue config and recover lock to the MGS. We need to randomize this + * in order to not flood the MGS. 
+ */ +#define MGC_TIMEOUT_MIN_SECONDS 5 + +extern unsigned int mgc_requeue_timeout_min; + static inline int cld_is_sptlrpc(struct config_llog_data *cld) { return cld->cld_type == CONFIG_T_SPTLRPC; diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index 0f2743e..00ee152 100644 --- a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -617,13 +617,6 @@ static void do_requeue(struct config_llog_data *cld) EXIT; } -/* this timeout represents how many seconds MGC should wait before - * requeue config and recover lock to the MGS. We need to randomize this - * in order to not flood the MGS. - */ -#define MGC_TIMEOUT_MIN_SECONDS 5 -#define MGC_TIMEOUT_RAND_CENTISEC 500 - static int mgc_requeue_thread(void *data) { int rc = 0; @@ -637,7 +630,6 @@ static int mgc_requeue_thread(void *data) rq_state |= RQ_RUNNING; while (!(rq_state & RQ_STOP)) { struct config_llog_data *cld, *cld_prev; - int rand = prandom_u32_max(MGC_TIMEOUT_RAND_CENTISEC); int to; /* Any new or requeued lostlocks will change the state */ @@ -653,11 +645,11 @@ static int mgc_requeue_thread(void *data) * caused the lock revocation to finish its setup, plus some * random so everyone doesn't try to reconnect at once. */ - to = cfs_time_seconds(MGC_TIMEOUT_MIN_SECONDS * 100 + rand); - /* rand is centi-seconds */ + to = mgc_requeue_timeout_min == 0 ? 1 : mgc_requeue_timeout_min; + to = cfs_time_seconds(mgc_requeue_timeout_min) + + prandom_u32_max(cfs_time_seconds(to)); wait_event_idle_timeout(rq_waitq, - rq_state & (RQ_STOP | RQ_PRECLEANUP), - to/100); + rq_state & (RQ_STOP | RQ_PRECLEANUP), to); /* * iterate & processing through the list. 
for each cld, process @@ -2277,6 +2269,40 @@ static const struct obd_ops mgc_obd_ops = { .o_process_config = mgc_process_config, }; +static int mgc_param_requeue_timeout_min_set(const char *val, + cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int num; + + rc = kstrtouint(val, 0, &num); + if (rc < 0) + return rc; + if (num > 120) + return -EINVAL; + + mgc_requeue_timeout_min = num; + + return 0; +} + +static const struct kernel_param_ops param_ops_requeue_timeout_min = { + .set = mgc_param_requeue_timeout_min_set, + .get = param_get_uint, +}; + +#define param_check_requeue_timeout_min(name, p) \ + __param_check(name, p, unsigned int) + +unsigned int mgc_requeue_timeout_min = MGC_TIMEOUT_MIN_SECONDS; +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(mgc_requeue_timeout_min, requeue_timeout_min, 0644); +#else +module_param_call(mgc_requeue_timeout_min, mgc_param_requeue_timeout_min_set, + param_get_uint, &param_ops_requeue_timeout_min, 0644); +#endif +MODULE_PARM_DESC(mgc_requeue_timeout_min, "Minimal requeue time to refresh logs"); + static int __init mgc_init(void) { return class_register_type(&mgc_obd_ops, NULL, false, NULL, diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index e5590cf..f5e380a 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -5312,6 +5312,10 @@ init_param_vars () { TIMEOUT=$(do_facet $SINGLEMDS "lctl get_param -n timeout") log "Using TIMEOUT=$TIMEOUT" + # tune down to speed up testing on (usually) small setups + do_nodes $(comma_list $(nodes_list)) \ + "echo 1 >/sys/module/mgc/parameters/mgc_requeue_timeout_min" + osc_ensure_active $SINGLEMDS $TIMEOUT osc_ensure_active client $TIMEOUT $LCTL set_param osc.*.idle_timeout=debug diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index 29bfe20..8f11d9a 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -3747,7 +3747,7 @@ static int check_pool_cmd_result(enum lcfg_command_type cmd, char *fsname, if (rc == -ENODEV) return rc; if (rc < 
0) - sleep(2); + sleep(1); cpt--; } while ((rc < 0) && (cpt > 0)); if (rc >= 0) { @@ -3766,7 +3766,7 @@ static int check_pool_cmd_result(enum lcfg_command_type cmd, char *fsname, if (rc == -ENODEV) return rc; if (rc >= 0) - sleep(2); + sleep(1); cpt--; } while ((rc >= 0) && (cpt > 0)); if (rc < 0) { @@ -3785,7 +3785,7 @@ static int check_pool_cmd_result(enum lcfg_command_type cmd, char *fsname, if (rc == -ENODEV) return rc; if (rc != 1) - sleep(2); + sleep(1); cpt--; } while ((rc != 1) && (cpt > 0)); if (rc == 1) { @@ -3803,7 +3803,7 @@ static int check_pool_cmd_result(enum lcfg_command_type cmd, char *fsname, if (rc == -ENODEV) return rc; if (rc == 1) - sleep(2); + sleep(1); cpt--; } while ((rc == 1) && (cpt > 0)); if (rc != 1) { -- 1.8.3.1