Whamcloud - gitweb
LU-14518 ptlrpc: avoid server STONITH for slow request 25/53225/11
authorAndreas Dilger <adilger@whamcloud.com>
Fri, 12 Mar 2021 22:32:45 +0000 (15:32 -0700)
committerOleg Drokin <green@whamcloud.com>
Sun, 8 Sep 2024 16:03:08 +0000 (16:03 +0000)
If a service is temporarily overloaded, request processing times
may exceed at_max for a short time.  We don't want to increase
at_max excessively, since that slows down client RPC resend and
recovery, but we also want to avoid server STONITH because at_max
is used directly by ptlrpc_svcpt_health_check() to determine if
the service reports "NOT HEALTHY" and forces HA takeover.

Slow request processing is not as serious as, say, an LBUG, so
allow a configurable parameter at_unhealthy_factor to allow
requests to exceed at_max before a service is considered unhealthy.
This defaults to 3x at_max, and 0 disables service health checks.

Also, importantly, it shouldn't be considered an error if the
RPC *requests* are waiting a long time (that can happen if the
server is overloaded, or NRS is delaying some RPCs), but only
if the *service* is unable to process *any* requests in a long
time.  Otherwise, it will only print a warning of delayed RPCs.

Add sanityn.sh test_200 to exercise related health check test

Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Change-Id: Ifaf4454efacf5f5ec8fc24f75a49e17e5a3ebbe5
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53225
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Mikhail Pershin <mpershin@whamcloud.com>
Reviewed-by: Sergey Cheremencev <scherementsev@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
12 files changed:
lustre/include/lustre_net.h
lustre/include/obd.h
lustre/include/obd_support.h
lustre/mdc/lproc_mdc.c
lustre/mdt/mdt_lproc.c
lustre/mgs/lproc_mgs.c
lustre/obdclass/class_obd.c
lustre/obdclass/obd_sysfs.c
lustre/ptlrpc/ptlrpc_internal.h
lustre/ptlrpc/service.c
lustre/tests/sanityn.sh
lustre/tests/test-framework.sh

index e70efb1..50c8fe7 100644 (file)
@@ -1766,6 +1766,8 @@ struct ptlrpc_service_part {
        struct ptlrpc_nrs               scp_nrs_reg;
        /** NRS head for HP reqs; valid for services that can handle HP reqs */
        struct ptlrpc_nrs              *scp_nrs_hp;
+       /** when the last request was handled on this service */
+       time64_t                        scp_last_request;
 
        /** AT stuff */
        /** @{ */
index 061119c..58e1160 100644 (file)
@@ -765,6 +765,7 @@ struct obd_device {
        unsigned int                    obd_at_min;
        unsigned int                    obd_at_max;
        unsigned int                    obd_at_history;
+       unsigned int                    obd_at_unhealthy_factor;
        unsigned int                    obd_ldlm_enqueue_min;
 };
 
@@ -780,6 +781,11 @@ struct obd_device {
        struct obd_device *_obd = obd; \
        _obd && _obd->obd_at_history ? _obd->obd_at_history : at_history; \
 })
+#define obd_get_at_unhealthy_factor(obd) ({ \
+       struct obd_device *_obd = obd; \
+       _obd && _obd->obd_at_unhealthy_factor ? _obd->obd_at_unhealthy_factor :\
+                                               at_unhealthy_factor; \
+})
 extern unsigned int ldlm_enqueue_min;
 #define obd_get_ldlm_enqueue_min(obd) ({ \
        struct obd_device *_obd = obd; \
index f063eeb..b82f070 100644 (file)
@@ -63,6 +63,7 @@ extern unsigned int bulk_timeout;
 extern unsigned int at_min;
 extern unsigned int at_max;
 extern unsigned int at_history;
+extern unsigned int at_unhealthy_factor;
 extern int at_early_margin;
 extern int at_extra;
 extern unsigned long obd_max_dirty_pages;
index 1809e90..1396c12 100644 (file)
@@ -775,6 +775,7 @@ LUSTRE_RW_ATTR(grant_shrink_interval);
 LUSTRE_OBD_UINT_PARAM_ATTR(at_min);
 LUSTRE_OBD_UINT_PARAM_ATTR(at_max);
 LUSTRE_OBD_UINT_PARAM_ATTR(at_history);
+LUSTRE_OBD_UINT_PARAM_ATTR(at_unhealthy_factor);
 
 static struct attribute *mdc_attrs[] = {
        &lustre_attr_active.attr,
@@ -792,6 +793,7 @@ static struct attribute *mdc_attrs[] = {
        &lustre_attr_at_max.attr,
        &lustre_attr_at_min.attr,
        &lustre_attr_at_history.attr,
+       &lustre_attr_at_unhealthy_factor.attr,
        NULL,
 };
 
index f94fd4c..e950d5c 100644 (file)
@@ -1344,8 +1344,13 @@ LUSTRE_RO_ATTR(eviction_count);
 LUSTRE_OBD_UINT_PARAM_ATTR(at_min);
 LUSTRE_OBD_UINT_PARAM_ATTR(at_max);
 LUSTRE_OBD_UINT_PARAM_ATTR(at_history);
+LUSTRE_OBD_UINT_PARAM_ATTR(at_unhealthy_factor);
 
 static struct attribute *mdt_attrs[] = {
+       &lustre_attr_at_min.attr,
+       &lustre_attr_at_max.attr,
+       &lustre_attr_at_history.attr,
+       &lustre_attr_at_unhealthy_factor.attr,
        &lustre_attr_tot_dirty.attr,
        &lustre_attr_tot_granted.attr,
        &lustre_attr_tot_pending.attr,
@@ -1367,12 +1372,16 @@ static struct attribute *mdt_attrs[] = {
        &lustre_attr_enable_dir_migration.attr,
        &lustre_attr_enable_dir_restripe.attr,
        &lustre_attr_enable_dir_auto_split.attr,
+       &lustre_attr_enable_dmv_implicit_inherit.attr,
+       &lustre_attr_enable_dmv_xattr.attr,
        &lustre_attr_enable_parallel_rename_dir.attr,
        &lustre_attr_enable_parallel_rename_file.attr,
        &lustre_attr_enable_parallel_rename_crossdir.attr,
        &lustre_attr_enable_remote_dir.attr,
        &lustre_attr_enable_remote_dir_gid.attr,
        &lustre_attr_enable_remote_rename.attr,
+       &lustre_attr_enable_remote_subdir_mount.attr,
+       &lustre_attr_enable_strict_som.attr,
        &lustre_attr_enable_striped_dir.attr,
        &lustre_attr_commit_on_sharing.attr,
        &lustre_attr_local_recovery.attr,
@@ -1381,7 +1390,6 @@ static struct attribute *mdt_attrs[] = {
        &lustre_attr_sync_count.attr,
        &lustre_attr_dom_lock.attr,
        &lustre_attr_dom_read_open.attr,
-       &lustre_attr_enable_strict_som.attr,
        &lustre_attr_migrate_hsm_allowed.attr,
        &lustre_attr_hsm_control.attr,
        &lustre_attr_job_cleanup_interval.attr,
@@ -1391,13 +1399,7 @@ static struct attribute *mdt_attrs[] = {
        &lustre_attr_dir_split_delta.attr,
        &lustre_attr_dir_restripe_nsonly.attr,
        &lustre_attr_checksum_t10pi_enforce.attr,
-       &lustre_attr_enable_remote_subdir_mount.attr,
        &lustre_attr_max_mod_rpcs_in_flight.attr,
-       &lustre_attr_enable_dmv_implicit_inherit.attr,
-       &lustre_attr_at_min.attr,
-       &lustre_attr_at_max.attr,
-       &lustre_attr_at_history.attr,
-       &lustre_attr_enable_dmv_xattr.attr,
        NULL,
 };
 
index 86bc4c4..69a9335 100644 (file)
@@ -245,6 +245,7 @@ LUSTRE_RO_ATTR(mntdev);
 LUSTRE_OBD_UINT_PARAM_ATTR(at_min);
 LUSTRE_OBD_UINT_PARAM_ATTR(at_max);
 LUSTRE_OBD_UINT_PARAM_ATTR(at_history);
+LUSTRE_OBD_UINT_PARAM_ATTR(at_unhealthy_factor);
 
 static struct attribute *mgs_attrs[] = {
        &lustre_attr_fstype.attr,
@@ -254,6 +255,7 @@ static struct attribute *mgs_attrs[] = {
        &lustre_attr_at_min.attr,
        &lustre_attr_at_max.attr,
        &lustre_attr_at_history.attr,
+       &lustre_attr_at_unhealthy_factor.attr,
        NULL,
 };
 
index aae8b56..5233b14 100644 (file)
@@ -93,6 +93,9 @@ unsigned int at_max = 600;
 EXPORT_SYMBOL(at_max);
 unsigned int at_history = 600;
 EXPORT_SYMBOL(at_history);
+/* Multiple of at_max when service is thought unhealthy and may be STONITH'd */
+unsigned int at_unhealthy_factor = 3;
+EXPORT_SYMBOL(at_unhealthy_factor);
 int at_early_margin = 5;
 EXPORT_SYMBOL(at_early_margin);
 int at_extra = 30;
index 029fbfd..790423e 100644 (file)
@@ -123,6 +123,7 @@ LUSTRE_STATIC_UINT_ATTR(at_max, &at_max);
 LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra);
 LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin);
 LUSTRE_STATIC_UINT_ATTR(at_history, &at_history);
+LUSTRE_STATIC_UINT_ATTR(at_unhealthy_factor, &at_unhealthy_factor);
 LUSTRE_STATIC_UINT_ATTR(enable_stats_header, &obd_enable_stats_header);
 LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction);
 LUSTRE_STATIC_UINT_ATTR(ping_interval, &ping_interval);
@@ -498,6 +499,7 @@ static struct attribute *lustre_attrs[] = {
        &lustre_sattr_at_extra.u.attr,
        &lustre_sattr_at_early_margin.u.attr,
        &lustre_sattr_at_history.u.attr,
+       &lustre_sattr_at_unhealthy_factor.u.attr,
        &lustre_attr_memused_max.attr,
        &lustre_attr_memused.attr,
 #ifdef HAVE_SERVER_SUPPORT
index 95f5955..1feb92b 100644 (file)
@@ -169,7 +169,7 @@ ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp,
 static inline struct ptlrpc_request *
 ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp)
 {
-       return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false);
+       return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, true);
 }
 
 void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req);
index acaa065..3818c80 100644 (file)
@@ -54,6 +54,9 @@ MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)");
 module_param(at_history, int, 0644);
 MODULE_PARM_DESC(at_history,
                 "Adaptive timeouts remember the slowest event that took place within this period (sec)");
+module_param(at_unhealthy_factor, int, 0644);
+MODULE_PARM_DESC(at_unhealthy_factor,
+                "Multiple of at_max when delayed RPCs considered unhealthy");
 module_param(at_early_margin, int, 0644);
 MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply");
 module_param(at_extra, int, 0644);
@@ -2047,6 +2050,7 @@ ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force)
        RETURN(NULL);
 
 got_request:
+       svcpt->scp_last_request = ktime_get_real_seconds();
        svcpt->scp_nreqs_active++;
        if (req->rq_hp)
                svcpt->scp_nhreqs_active++;
@@ -3673,42 +3677,69 @@ EXPORT_SYMBOL(ptlrpc_unregister_service);
 /**
  * Returns 0 if the service is healthy.
  *
- * Right now, it just checks to make sure that requests aren't languishing
- * in the queue.  We'll use this health check to govern whether a node needs
- * to be shot, so it's intentionally non-aggressive.
+ * Check whether requests have been waiting in the queue for an excessive
+ * time without being processed.  Individual requests may wait in the queue
+ * for some time due to NRS policies, overloaded storage, etc. but the queue
+ * itself should continue to process some requests on a regular basis.
+ *
+ * We'll use this health check to govern whether a node needs to be shot,
+ * so it's intentionally non-aggressive.
  */
 static int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt)
 {
        struct ptlrpc_request *request = NULL;
-       struct timespec64 right_now;
-       struct timespec64 timediff;
        struct obd_device *obd = NULL;
-
-       ktime_get_real_ts64(&right_now);
+       time64_t right_now;
+       time64_t req_waited;
+       time64_t svc_waited;
+       bool recovering;
+       unsigned int max;
+
+       /* quick check without locking to handle the most common case */
+       right_now = ktime_get_real_seconds();
+       if (likely(right_now - svcpt->scp_last_request < obd_get_at_max(NULL)))
+               return 0;
 
        spin_lock(&svcpt->scp_req_lock);
-       /* How long has the next entry been waiting? */
        if (ptlrpc_server_high_pending(svcpt, true))
                request = ptlrpc_nrs_req_peek_nolock(svcpt, true);
        else if (ptlrpc_server_normal_pending(svcpt, true))
                request = ptlrpc_nrs_req_peek_nolock(svcpt, false);
 
+       /* if no waiting requests, service idle time is irrelevant */
        if (request == NULL) {
                spin_unlock(&svcpt->scp_req_lock);
                return 0;
        }
 
-       timediff = timespec64_sub(right_now, request->rq_arrival_time);
-       spin_unlock(&svcpt->scp_req_lock);
-
        if (request->rq_export)
                obd = request->rq_export->exp_obd;
 
-       if ((timediff.tv_sec) >
-           (obd_at_off(obd) ? obd_timeout * 3 / 2 : obd_get_at_max(obd))) {
-               CERROR("%s: unhealthy - request has been waiting %llds\n",
-                      svcpt->scp_service->srv_name, (s64)timediff.tv_sec);
-               return -1;
+       req_waited = right_now - request->rq_arrival_time.tv_sec;
+       svc_waited = right_now - svcpt->scp_last_request;
+       recovering = obd ? obd->obd_recovering : false;
+       spin_unlock(&svcpt->scp_req_lock);
+
+       max = obd_get_at_max(obd);
+       if (min(req_waited, svc_waited) > max && !recovering) {
+               bool unhealthy = false;
+
+               /* if at_unhealthy_factor = 0 then disable unhealthy status but
+                * at least print a warning if requests are stuck for a while
+                */
+               if (obd_get_at_unhealthy_factor(obd) &&
+                   svc_waited > max * obd_get_at_unhealthy_factor(obd)) {
+                       /* check if other CPTs in svc also unhealthy? */
+                       unhealthy = true;
+               }
+
+               CDEBUG_LIMIT(unhealthy ? D_ERROR : D_WARNING,
+                            "%s: %s - request waiting %llus, service %llus\n",
+                            obd ? obd->obd_name : svcpt->scp_service->srv_name,
+                            unhealthy ? "unhealthy" : "notice",
+                            req_waited, svc_waited);
+               if (unhealthy)
+                       return -1;
        }
 
        return 0;
index b0d8962..baacb3c 100755 (executable)
@@ -6784,6 +6784,60 @@ test_116() {
 }
 run_test 116 "DNE: Set default LMV layout from a remote client"
 
+test_200() {
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
+
+       local at_max=10
+       local delay=20
+       local old_at=$(do_facet ost1 "$LCTL get_param -n at_max")
+       local old_uf=$(do_facet ost1 "$LCTL get_param -n at_unhealthy_factor")
+       local old_thrds=$(do_facet ost1 \
+               "$LCTL get_param -n ost.OSS.ost_io.threads_max")
+       local thrds=$(do_facet ost1 \
+               "$LCTL get_param -n ost.OSS.ost_io.threads_started")
+
+       if [[ -z "$old_uf" ]]; then
+               skip "no at_unhealthy_factor, skipping"
+       fi
+
+       stack_trap "wait_osc_import_ready client ost1"
+       do_facet ost1 "$LCTL set_param at_max=$at_max at_unhealthy_factor=3 \
+                      ost.OSS.ost_io.threads_max=$thrds"
+       stack_trap "do_facet ost1 $LCTL set_param at_max=$old_at \
+                                       at_unhealthy_factor=$old_uf \
+                                       ost.OSS.ost_io.threads_max=$old_thrds"
+
+       test_mkdir $DIR/$tdir
+       $LFS setstripe -i 0 -c 1 $DIR/$tdir
+       # delay for long enough to exceed old UNHEALTHY check, but allow
+       # threads to complete RPCs occasionally to keep service healthy
+       #define OBD_FAIL_OFD_COMMITRW_DELAY 0x1e1
+       do_facet ost1 "$LCTL set_param fail_loc=0x1e1 fail_val=$delay"
+
+       for ((i=0; i < thrds * 2; i++)); do
+               dd if=/dev/urandom of=$DIR/$tdir/$tfile.$i bs=4k count=1 \
+                       oflag=direct &
+       done
+       sync &
+       sleep 5
+       # enough time for 2 sets of writes to be blocked for $fail_val
+       local end=$((SECONDS + delay))
+
+       sleep $at_max
+       while ((SECONDS < end)); do
+               local health=$(do_facet ost1 $LCTL get_param -n health_check)
+
+               if [[ "$health" != "healthy" ]]; then
+                       echo "OST health=$health"
+                       break
+               fi
+               sleep 3
+       done
+       (( SECONDS >= end )) || error "unhealthy state was seen"
+       wait
+}
+run_test 200 "service remains healthy while able to process request"
+
 log "cleanup: ======================================================"
 
 # kill and wait in each test only guarentee script finish, but command in script
index c12c9ec..c352747 100755 (executable)
@@ -8711,7 +8711,7 @@ _wait_osc_import_state() {
        if [[ $facet == client* ]]; then
                # During setup time, the osc might not be setup, it need wait
                # until list_param can return valid value.
-               params=$($LCTL list_param $param 2>/dev/null || true)
+               params=$($LCTL list_param $param 2>/dev/null | head -1)
                while [ -z "$params" ]; do
                        if [ $i -ge $maxtime ]; then
                                echo "can't get $param in $maxtime secs"
@@ -8719,7 +8719,7 @@ _wait_osc_import_state() {
                        fi
                        sleep 1
                        i=$((i + 1))
-                       params=$($LCTL list_param $param 2>/dev/null || true)
+                       params=$($LCTL list_param $param 2>/dev/null | head -1)
                done
        fi