From: Andreas Dilger Date: Fri, 12 Mar 2021 22:32:45 +0000 (-0700) Subject: LU-14518 ptlrpc: avoid server STONITH for slow request X-Git-Tag: 2.15.91~44 X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=refs%2Fchanges%2F25%2F53225%2F11;p=fs%2Flustre-release.git LU-14518 ptlrpc: avoid server STONITH for slow request If a service is temporarily overloaded, request processing times may exceed at_max for a short time. We don't want to increase at_max excessively, since that slows down client RPC resend and recovery, but we also want to avoid server STONITH because at_max is used directly by ptlrpc_svcpt_health_check() to determine if the service reports "NOT HEALTHY" and forces HA takeover. Slow request processing is not as serious as, say, an LBUG, so allow a configurable parameter at_unhealthy_factor to allow requests to exceed at_max before a service is considered unhealthy. This defaults to 3x at_max, and 0 disables service health checks. Also, importantly, it shouldn't be considered an error if the RPC *requests* are waiting a long time (that can happen if the server is overloaded, or NRS is delaying some RPCs), but only if the *service* is unable to process *any* requests in a long time. Otherwise, it will only print a warning of delayed RPCs. Add sanityn.sh test_200 to exercise related health check test Signed-off-by: Andreas Dilger Change-Id: Ifaf4454efacf5f5ec8fc24f75a49e17e5a3ebbe5 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53225 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Mikhail Pershin Reviewed-by: Sergey Cheremencev Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index e70efb1..50c8fe7 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -1766,6 +1766,8 @@ struct ptlrpc_service_part { struct ptlrpc_nrs scp_nrs_reg; /** NRS head for HP reqs; valid for services that can handle HP reqs */ struct ptlrpc_nrs *scp_nrs_hp; + /** when the last request was handled on this service */ + time64_t scp_last_request; /** AT stuff */ /** @{ */ diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 061119c..58e1160 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -765,6 +765,7 @@ struct obd_device { unsigned int obd_at_min; unsigned int obd_at_max; unsigned int obd_at_history; + unsigned int obd_at_unhealthy_factor; unsigned int obd_ldlm_enqueue_min; }; @@ -780,6 +781,11 @@ struct obd_device { struct obd_device *_obd = obd; \ _obd && _obd->obd_at_history ? _obd->obd_at_history : at_history; \ }) +#define obd_get_at_unhealthy_factor(obd) ({ \ + struct obd_device *_obd = obd; \ + _obd && _obd->obd_at_unhealthy_factor ? _obd->obd_at_unhealthy_factor :\ + at_unhealthy_factor; \ +}) extern unsigned int ldlm_enqueue_min; #define obd_get_ldlm_enqueue_min(obd) ({ \ struct obd_device *_obd = obd; \ diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index f063eeb..b82f070 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -63,6 +63,7 @@ extern unsigned int bulk_timeout; extern unsigned int at_min; extern unsigned int at_max; extern unsigned int at_history; +extern unsigned int at_unhealthy_factor; extern int at_early_margin; extern int at_extra; extern unsigned long obd_max_dirty_pages; diff --git a/lustre/mdc/lproc_mdc.c b/lustre/mdc/lproc_mdc.c index 1809e90..1396c12 100644 --- a/lustre/mdc/lproc_mdc.c +++ b/lustre/mdc/lproc_mdc.c @@ -775,6 +775,7 @@ LUSTRE_RW_ATTR(grant_shrink_interval); LUSTRE_OBD_UINT_PARAM_ATTR(at_min); LUSTRE_OBD_UINT_PARAM_ATTR(at_max); LUSTRE_OBD_UINT_PARAM_ATTR(at_history); +LUSTRE_OBD_UINT_PARAM_ATTR(at_unhealthy_factor); static struct attribute *mdc_attrs[] = { &lustre_attr_active.attr, @@ -792,6 +793,7 @@ static struct attribute *mdc_attrs[] = { &lustre_attr_at_max.attr, &lustre_attr_at_min.attr, &lustre_attr_at_history.attr, + &lustre_attr_at_unhealthy_factor.attr, NULL, }; diff --git a/lustre/mdt/mdt_lproc.c b/lustre/mdt/mdt_lproc.c index f94fd4c..e950d5c 100644 --- a/lustre/mdt/mdt_lproc.c +++ b/lustre/mdt/mdt_lproc.c @@ -1344,8 +1344,13 @@ LUSTRE_RO_ATTR(eviction_count); LUSTRE_OBD_UINT_PARAM_ATTR(at_min); LUSTRE_OBD_UINT_PARAM_ATTR(at_max); LUSTRE_OBD_UINT_PARAM_ATTR(at_history); +LUSTRE_OBD_UINT_PARAM_ATTR(at_unhealthy_factor); static struct attribute *mdt_attrs[] = { + &lustre_attr_at_min.attr, + &lustre_attr_at_max.attr, + &lustre_attr_at_history.attr, + &lustre_attr_at_unhealthy_factor.attr, &lustre_attr_tot_dirty.attr, &lustre_attr_tot_granted.attr, &lustre_attr_tot_pending.attr, @@ -1367,12 +1372,16 @@ static struct attribute *mdt_attrs[] = { &lustre_attr_enable_dir_migration.attr, &lustre_attr_enable_dir_restripe.attr, &lustre_attr_enable_dir_auto_split.attr, + &lustre_attr_enable_dmv_implicit_inherit.attr, + &lustre_attr_enable_dmv_xattr.attr, &lustre_attr_enable_parallel_rename_dir.attr, &lustre_attr_enable_parallel_rename_file.attr, &lustre_attr_enable_parallel_rename_crossdir.attr, &lustre_attr_enable_remote_dir.attr, &lustre_attr_enable_remote_dir_gid.attr, &lustre_attr_enable_remote_rename.attr, + &lustre_attr_enable_remote_subdir_mount.attr, + &lustre_attr_enable_strict_som.attr, &lustre_attr_enable_striped_dir.attr, &lustre_attr_commit_on_sharing.attr, &lustre_attr_local_recovery.attr, @@ -1381,7 +1390,6 @@ static struct attribute *mdt_attrs[] = { &lustre_attr_sync_count.attr, &lustre_attr_dom_lock.attr, &lustre_attr_dom_read_open.attr, - &lustre_attr_enable_strict_som.attr, &lustre_attr_migrate_hsm_allowed.attr, &lustre_attr_hsm_control.attr, &lustre_attr_job_cleanup_interval.attr, @@ -1391,13 +1399,7 @@ static struct attribute *mdt_attrs[] = { &lustre_attr_dir_split_delta.attr, &lustre_attr_dir_restripe_nsonly.attr, &lustre_attr_checksum_t10pi_enforce.attr, - &lustre_attr_enable_remote_subdir_mount.attr, &lustre_attr_max_mod_rpcs_in_flight.attr, - &lustre_attr_enable_dmv_implicit_inherit.attr, - &lustre_attr_at_min.attr, - &lustre_attr_at_max.attr, - &lustre_attr_at_history.attr, - &lustre_attr_enable_dmv_xattr.attr, NULL, }; diff --git a/lustre/mgs/lproc_mgs.c b/lustre/mgs/lproc_mgs.c index 86bc4c4..69a9335 100644 --- a/lustre/mgs/lproc_mgs.c +++ b/lustre/mgs/lproc_mgs.c @@ -245,6 +245,7 @@ LUSTRE_RO_ATTR(mntdev); LUSTRE_OBD_UINT_PARAM_ATTR(at_min); LUSTRE_OBD_UINT_PARAM_ATTR(at_max); LUSTRE_OBD_UINT_PARAM_ATTR(at_history); +LUSTRE_OBD_UINT_PARAM_ATTR(at_unhealthy_factor); static struct attribute *mgs_attrs[] = { &lustre_attr_fstype.attr, @@ -254,6 +255,7 @@ static struct attribute *mgs_attrs[] = { &lustre_attr_at_min.attr, &lustre_attr_at_max.attr, &lustre_attr_at_history.attr, + &lustre_attr_at_unhealthy_factor.attr, NULL, }; diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index aae8b56..5233b14 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -93,6 +93,9 @@ unsigned int at_max = 600; EXPORT_SYMBOL(at_max); unsigned int at_history = 600; EXPORT_SYMBOL(at_history); +/* Multiple of at_max when service is thought unhealthy and may be STONITH'd */ +unsigned int at_unhealthy_factor = 3; +EXPORT_SYMBOL(at_unhealthy_factor); int at_early_margin = 5; EXPORT_SYMBOL(at_early_margin); int at_extra = 30; diff --git a/lustre/obdclass/obd_sysfs.c b/lustre/obdclass/obd_sysfs.c index 029fbfd..790423e 100644 --- a/lustre/obdclass/obd_sysfs.c +++ b/lustre/obdclass/obd_sysfs.c @@ -123,6 +123,7 @@ LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); +LUSTRE_STATIC_UINT_ATTR(at_unhealthy_factor, &at_unhealthy_factor); LUSTRE_STATIC_UINT_ATTR(enable_stats_header, &obd_enable_stats_header); LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction); LUSTRE_STATIC_UINT_ATTR(ping_interval, &ping_interval); @@ -498,6 +499,7 @@ static struct attribute *lustre_attrs[] = { &lustre_sattr_at_extra.u.attr, &lustre_sattr_at_early_margin.u.attr, &lustre_sattr_at_history.u.attr, + &lustre_sattr_at_unhealthy_factor.u.attr, &lustre_attr_memused_max.attr, &lustre_attr_memused.attr, #ifdef HAVE_SERVER_SUPPORT diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index 95f5955..1feb92b 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -169,7 +169,7 @@ ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp, static inline struct ptlrpc_request * ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp) { - return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false); + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, true); } void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index acaa065..3818c80 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -54,6 +54,9 @@ MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)"); module_param(at_history, int, 0644); MODULE_PARM_DESC(at_history, "Adaptive timeouts remember the slowest event that took place within this period (sec)"); +module_param(at_unhealthy_factor, int, 0644); +MODULE_PARM_DESC(at_unhealthy_factor, + "Multiple of at_max when delayed RPCs considered unhealthy"); module_param(at_early_margin, int, 0644); MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply"); module_param(at_extra, int, 0644); @@ -2047,6 +2050,7 @@ ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) RETURN(NULL); got_request: + svcpt->scp_last_request = ktime_get_real_seconds(); svcpt->scp_nreqs_active++; if (req->rq_hp) svcpt->scp_nhreqs_active++; @@ -3673,42 +3677,69 @@ EXPORT_SYMBOL(ptlrpc_unregister_service); /** * Returns 0 if the service is healthy. * - * Right now, it just checks to make sure that requests aren't languishing - * in the queue. We'll use this health check to govern whether a node needs - * to be shot, so it's intentionally non-aggressive. + * Check whether requests have been waiting in the queue for an excessive + * time without being processed. Individual requests may wait in the queue + * for some time due to NRS policies, overloaded storage, etc. but the queue + * itself should continue to process some requests on a regular basis. + * + * We'll use this health check to govern whether a node needs to be shot, + * so it's intentionally non-aggressive. */ static int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt) { struct ptlrpc_request *request = NULL; - struct timespec64 right_now; - struct timespec64 timediff; struct obd_device *obd = NULL; - - ktime_get_real_ts64(&right_now); + time64_t right_now; + time64_t req_waited; + time64_t svc_waited; + bool recovering; + unsigned int max; + + /* quick check without locking to handle the most common case */ + right_now = ktime_get_real_seconds(); + if (likely(right_now - svcpt->scp_last_request < obd_get_at_max(NULL))) + return 0; spin_lock(&svcpt->scp_req_lock); - /* How long has the next entry been waiting? */ if (ptlrpc_server_high_pending(svcpt, true)) request = ptlrpc_nrs_req_peek_nolock(svcpt, true); else if (ptlrpc_server_normal_pending(svcpt, true)) request = ptlrpc_nrs_req_peek_nolock(svcpt, false); + /* if no waiting requests, service idle time is irrelevant */ if (request == NULL) { spin_unlock(&svcpt->scp_req_lock); return 0; } - timediff = timespec64_sub(right_now, request->rq_arrival_time); - spin_unlock(&svcpt->scp_req_lock); - if (request->rq_export) obd = request->rq_export->exp_obd; - if ((timediff.tv_sec) > - (obd_at_off(obd) ? obd_timeout * 3 / 2 : obd_get_at_max(obd))) { - CERROR("%s: unhealthy - request has been waiting %llds\n", - svcpt->scp_service->srv_name, (s64)timediff.tv_sec); - return -1; + req_waited = right_now - request->rq_arrival_time.tv_sec; + svc_waited = right_now - svcpt->scp_last_request; + recovering = obd ? obd->obd_recovering : false; + spin_unlock(&svcpt->scp_req_lock); + + max = obd_get_at_max(obd); + if (min(req_waited, svc_waited) > max && !recovering) { + bool unhealthy = false; + + /* if at_unhealthy_factor = 0 then disable unhealthy status but + * at least print a warning if requests are stuck for a while + */ + if (obd_get_at_unhealthy_factor(obd) && + svc_waited > max * obd_get_at_unhealthy_factor(obd)) { + /* check if other CPTs in svc also unhealthy? */ + unhealthy = true; + } + + CDEBUG_LIMIT(unhealthy ? D_ERROR : D_WARNING, + "%s: %s - request waiting %llus, service %llus\n", + obd ? obd->obd_name : svcpt->scp_service->srv_name, + unhealthy ? "unhealthy" : "notice", + req_waited, svc_waited); + if (unhealthy) + return -1; } return 0; diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index b0d8962..baacb3c 100755 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -6784,6 +6784,60 @@ test_116() { } run_test 116 "DNE: Set default LMV layout from a remote client" +test_200() { + remote_ost_nodsh && skip "remote OST with nodsh" && return + + local at_max=10 + local delay=20 + local old_at=$(do_facet ost1 "$LCTL get_param -n at_max") + local old_uf=$(do_facet ost1 "$LCTL get_param -n at_unhealthy_factor") + local old_thrds=$(do_facet ost1 \ + "$LCTL get_param -n ost.OSS.ost_io.threads_max") + local thrds=$(do_facet ost1 \ + "$LCTL get_param -n ost.OSS.ost_io.threads_started") + + if [[ -z "$old_uf" ]]; then + skip "no at_unhealthy_factor, skipping" + fi + + stack_trap "wait_osc_import_ready client ost1" + do_facet ost1 "$LCTL set_param at_max=$at_max at_unhealthy_factor=3 \ + ost.OSS.ost_io.threads_max=$thrds" + stack_trap "do_facet ost1 $LCTL set_param at_max=$old_at \ + at_unhealthy_factor=$old_uf \ + ost.OSS.ost_io.threads_max=$old_thrds" + + test_mkdir $DIR/$tdir + $LFS setstripe -i 0 -c 1 $DIR/$tdir + # delay for long enough to exceed old UNHEALTHY check, but allow + # threads to complete RPCs occasionally to keep service healthy + #define OBD_FAIL_OFD_COMMITRW_DELAY 0x1e1 + do_facet ost1 "$LCTL set_param fail_loc=0x1e1 fail_val=$delay" + + for ((i=0; i < thrds * 2; i++)); do + dd if=/dev/urandom of=$DIR/$tdir/$tfile.$i bs=4k count=1 \ + oflag=direct & + done + sync & + sleep 5 + # enough time for 2 sets of writes to be blocked for $fail_val + local end=$((SECONDS + delay)) + + sleep $at_max + while ((SECONDS < end)); do + local health=$(do_facet ost1 $LCTL get_param -n health_check) + + if [[ "$health" != "healthy" ]]; then + echo "OST health=$health" + break + fi + sleep 3 + done + (( SECONDS >= end )) || error "unhealthy state was seen" + wait +} +run_test 200 "service remains healthy while able to process request" + log "cleanup: ======================================================" # kill and wait in each test only guarentee script finish, but command in script diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index c12c9ec..c352747 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -8711,7 +8711,7 @@ _wait_osc_import_state() { if [[ $facet == client* ]]; then # During setup time, the osc might not be setup, it need wait # until list_param can return valid value. - params=$($LCTL list_param $param 2>/dev/null || true) + params=$($LCTL list_param $param 2>/dev/null | head -1) while [ -z "$params" ]; do if [ $i -ge $maxtime ]; then echo "can't get $param in $maxtime secs" @@ -8719,7 +8719,7 @@ _wait_osc_import_state() { fi sleep 1 i=$((i + 1)) - params=$($LCTL list_param $param 2>/dev/null || true) + params=$($LCTL list_param $param 2>/dev/null | head -1) done fi