From 04b2da6180d3c8eda21f7ab36c676462be041b74 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Fri, 12 Mar 2021 12:00:37 +0300 Subject: [PATCH] LU-14516 mgc: configurable wait-to-reprocess time so we can set it shorter, for testing purposes at least. to change the minimal wait time, the MGC module option 'mgc_requeue_timeout_min' should be used (in seconds). additionally a random value up to mgc_requeue_timeout_min is added to avoid a flood of config re-read requests from clients. if mgc_requeue_timeout_min is set to 0, then the random part will be up to 1 second. ost-pools: before: 5840s, after: 3474s sanity-flr: before: 1575s, after: 1381s sanity-quota: before: 10679s, after: 9703s Signed-off-by: Alex Zhuravlev Change-Id: Iff7dad4ba14d687b7e891a1c346397e4c370800d Reviewed-on: https://review.whamcloud.com/42020 Tested-by: jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Aurelien Degremont Reviewed-by: Sebastien Buisson Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- lustre/mgc/mgc_internal.h | 8 +++++++ lustre/mgc/mgc_request.c | 50 ++++++++++++++++++++++++++++++++---------- lustre/tests/test-framework.sh | 4 ++++ lustre/utils/obd.c | 8 +++---- 4 files changed, 54 insertions(+), 16 deletions(-) diff --git a/lustre/mgc/mgc_internal.h b/lustre/mgc/mgc_internal.h index 3657c86..2289972 100644 --- a/lustre/mgc/mgc_internal.h +++ b/lustre/mgc/mgc_internal.h @@ -43,6 +43,14 @@ int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); +/* this timeout represents how many seconds MGC should wait before + * requeue config and recover lock to the MGS. We need to randomize this + * in order to not flood the MGS. 
+ */ +#define MGC_TIMEOUT_MIN_SECONDS 5 + +extern unsigned int mgc_requeue_timeout_min; + static inline bool cld_is_sptlrpc(struct config_llog_data *cld) { return cld->cld_type == MGS_CFG_T_SPTLRPC; diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index 72c52de..65ecd67 100644 --- a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -619,13 +619,6 @@ static void do_requeue(struct config_llog_data *cld) EXIT; } -/* this timeout represents how many seconds MGC should wait before - * requeue config and recover lock to the MGS. We need to randomize this - * in order to not flood the MGS. - */ -#define MGC_TIMEOUT_MIN_SECONDS 5 -#define MGC_TIMEOUT_RAND_CENTISEC 500 - static int mgc_requeue_thread(void *data) { int rc = 0; @@ -639,7 +632,6 @@ static int mgc_requeue_thread(void *data) rq_state |= RQ_RUNNING; while (!(rq_state & RQ_STOP)) { struct config_llog_data *cld, *cld_prev; - int rand = prandom_u32_max(MGC_TIMEOUT_RAND_CENTISEC); int to; /* Any new or requeued lostlocks will change the state */ @@ -655,11 +647,11 @@ static int mgc_requeue_thread(void *data) * caused the lock revocation to finish its setup, plus some * random so everyone doesn't try to reconnect at once. */ - to = cfs_time_seconds(MGC_TIMEOUT_MIN_SECONDS * 100 + rand); - /* rand is centi-seconds */ + to = mgc_requeue_timeout_min == 0 ? 1 : mgc_requeue_timeout_min; + to = cfs_time_seconds(mgc_requeue_timeout_min) + + prandom_u32_max(cfs_time_seconds(to)); wait_event_idle_timeout(rq_waitq, - rq_state & (RQ_STOP | RQ_PRECLEANUP), - to/100); + rq_state & (RQ_STOP | RQ_PRECLEANUP), to); /* * iterate & processing through the list. 
for each cld, process @@ -2288,6 +2280,40 @@ static const struct obd_ops mgc_obd_ops = { .o_process_config = mgc_process_config, }; +static int mgc_param_requeue_timeout_min_set(const char *val, + cfs_kernel_param_arg_t *kp) +{ + int rc; + unsigned int num; + + rc = kstrtouint(val, 0, &num); + if (rc < 0) + return rc; + if (num > 120) + return -EINVAL; + + mgc_requeue_timeout_min = num; + + return 0; +} + +static const struct kernel_param_ops param_ops_requeue_timeout_min = { + .set = mgc_param_requeue_timeout_min_set, + .get = param_get_uint, +}; + +#define param_check_requeue_timeout_min(name, p) \ + __param_check(name, p, unsigned int) + +unsigned int mgc_requeue_timeout_min = MGC_TIMEOUT_MIN_SECONDS; +#ifdef HAVE_KERNEL_PARAM_OPS +module_param(mgc_requeue_timeout_min, requeue_timeout_min, 0644); +#else +module_param_call(mgc_requeue_timeout_min, mgc_param_requeue_timeout_min_set, + param_get_uint, &param_ops_requeue_timeout_min, 0644); +#endif +MODULE_PARM_DESC(mgc_requeue_timeout_min, "Minimal requeue time to refresh logs"); + static int __init mgc_init(void) { return class_register_type(&mgc_obd_ops, NULL, false, diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index f5c2aba..6566c04 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -5270,6 +5270,10 @@ init_param_vars () { TIMEOUT=$(do_facet $SINGLEMDS "lctl get_param -n timeout") log "Using TIMEOUT=$TIMEOUT" + # tune down to speed up testing on (usually) small setups + do_nodes $(comma_list $(nodes_list)) \ + "echo 1 >/sys/module/mgc/parameters/mgc_requeue_timeout_min" + osc_ensure_active $SINGLEMDS $TIMEOUT osc_ensure_active client $TIMEOUT $LCTL set_param osc.*.idle_timeout=debug diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index 5d57927..9238659 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -3714,7 +3714,7 @@ static int check_pool_cmd_result(enum lcfg_command_type cmd, char *fsname, if (rc == -ENODEV) return rc; if (rc < 0) - 
sleep(2); + sleep(1); cpt--; } while ((rc < 0) && (cpt > 0)); if (rc >= 0) { @@ -3733,7 +3733,7 @@ static int check_pool_cmd_result(enum lcfg_command_type cmd, char *fsname, if (rc == -ENODEV) return rc; if (rc >= 0) - sleep(2); + sleep(1); cpt--; } while ((rc >= 0) && (cpt > 0)); if (rc < 0) { @@ -3752,7 +3752,7 @@ static int check_pool_cmd_result(enum lcfg_command_type cmd, char *fsname, if (rc == -ENODEV) return rc; if (rc != 1) - sleep(2); + sleep(1); cpt--; } while ((rc != 1) && (cpt > 0)); if (rc == 1) { @@ -3770,7 +3770,7 @@ static int check_pool_cmd_result(enum lcfg_command_type cmd, char *fsname, if (rc == -ENODEV) return rc; if (rc == 1) - sleep(2); + sleep(1); cpt--; } while ((rc == 1) && (cpt > 0)); if (rc != 1) { -- 1.8.3.1