struct ptlrpc_nrs scp_nrs_reg;
/** NRS head for HP reqs; valid for services that can handle HP reqs */
struct ptlrpc_nrs *scp_nrs_hp;
+ /** when the last request was handled on this service */
+ time64_t scp_last_request;
/** AT stuff */
/** @{ */
unsigned int obd_at_min;
unsigned int obd_at_max;
unsigned int obd_at_history;
+ unsigned int obd_at_unhealthy_factor;
unsigned int obd_ldlm_enqueue_min;
};
struct obd_device *_obd = obd; \
_obd && _obd->obd_at_history ? _obd->obd_at_history : at_history; \
})
+#define obd_get_at_unhealthy_factor(obd) ({ \
+ struct obd_device *_obd = obd; \
+ _obd && _obd->obd_at_unhealthy_factor ? _obd->obd_at_unhealthy_factor :\
+ at_unhealthy_factor; \
+})
extern unsigned int ldlm_enqueue_min;
#define obd_get_ldlm_enqueue_min(obd) ({ \
struct obd_device *_obd = obd; \
extern unsigned int at_min;
extern unsigned int at_max;
extern unsigned int at_history;
+extern unsigned int at_unhealthy_factor;
extern int at_early_margin;
extern int at_extra;
extern unsigned long obd_max_dirty_pages;
LUSTRE_OBD_UINT_PARAM_ATTR(at_min);
LUSTRE_OBD_UINT_PARAM_ATTR(at_max);
LUSTRE_OBD_UINT_PARAM_ATTR(at_history);
+LUSTRE_OBD_UINT_PARAM_ATTR(at_unhealthy_factor);
static struct attribute *mdc_attrs[] = {
&lustre_attr_active.attr,
&lustre_attr_at_max.attr,
&lustre_attr_at_min.attr,
&lustre_attr_at_history.attr,
+ &lustre_attr_at_unhealthy_factor.attr,
NULL,
};
LUSTRE_OBD_UINT_PARAM_ATTR(at_min);
LUSTRE_OBD_UINT_PARAM_ATTR(at_max);
LUSTRE_OBD_UINT_PARAM_ATTR(at_history);
+LUSTRE_OBD_UINT_PARAM_ATTR(at_unhealthy_factor);
static struct attribute *mdt_attrs[] = {
+ &lustre_attr_at_min.attr,
+ &lustre_attr_at_max.attr,
+ &lustre_attr_at_history.attr,
+ &lustre_attr_at_unhealthy_factor.attr,
&lustre_attr_tot_dirty.attr,
&lustre_attr_tot_granted.attr,
&lustre_attr_tot_pending.attr,
&lustre_attr_enable_dir_migration.attr,
&lustre_attr_enable_dir_restripe.attr,
&lustre_attr_enable_dir_auto_split.attr,
+ &lustre_attr_enable_dmv_implicit_inherit.attr,
+ &lustre_attr_enable_dmv_xattr.attr,
&lustre_attr_enable_parallel_rename_dir.attr,
&lustre_attr_enable_parallel_rename_file.attr,
&lustre_attr_enable_parallel_rename_crossdir.attr,
&lustre_attr_enable_remote_dir.attr,
&lustre_attr_enable_remote_dir_gid.attr,
&lustre_attr_enable_remote_rename.attr,
+ &lustre_attr_enable_remote_subdir_mount.attr,
+ &lustre_attr_enable_strict_som.attr,
&lustre_attr_enable_striped_dir.attr,
&lustre_attr_commit_on_sharing.attr,
&lustre_attr_local_recovery.attr,
&lustre_attr_sync_count.attr,
&lustre_attr_dom_lock.attr,
&lustre_attr_dom_read_open.attr,
- &lustre_attr_enable_strict_som.attr,
&lustre_attr_migrate_hsm_allowed.attr,
&lustre_attr_hsm_control.attr,
&lustre_attr_job_cleanup_interval.attr,
&lustre_attr_dir_split_delta.attr,
&lustre_attr_dir_restripe_nsonly.attr,
&lustre_attr_checksum_t10pi_enforce.attr,
- &lustre_attr_enable_remote_subdir_mount.attr,
&lustre_attr_max_mod_rpcs_in_flight.attr,
- &lustre_attr_enable_dmv_implicit_inherit.attr,
- &lustre_attr_at_min.attr,
- &lustre_attr_at_max.attr,
- &lustre_attr_at_history.attr,
- &lustre_attr_enable_dmv_xattr.attr,
NULL,
};
LUSTRE_OBD_UINT_PARAM_ATTR(at_min);
LUSTRE_OBD_UINT_PARAM_ATTR(at_max);
LUSTRE_OBD_UINT_PARAM_ATTR(at_history);
+LUSTRE_OBD_UINT_PARAM_ATTR(at_unhealthy_factor);
static struct attribute *mgs_attrs[] = {
&lustre_attr_fstype.attr,
&lustre_attr_at_min.attr,
&lustre_attr_at_max.attr,
&lustre_attr_at_history.attr,
+ &lustre_attr_at_unhealthy_factor.attr,
NULL,
};
EXPORT_SYMBOL(at_max);
unsigned int at_history = 600;
EXPORT_SYMBOL(at_history);
+/* Multiple of at_max when service is thought unhealthy and may be STONITH'd */
+unsigned int at_unhealthy_factor = 3;
+EXPORT_SYMBOL(at_unhealthy_factor);
int at_early_margin = 5;
EXPORT_SYMBOL(at_early_margin);
int at_extra = 30;
LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra);
LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin);
LUSTRE_STATIC_UINT_ATTR(at_history, &at_history);
+LUSTRE_STATIC_UINT_ATTR(at_unhealthy_factor, &at_unhealthy_factor);
LUSTRE_STATIC_UINT_ATTR(enable_stats_header, &obd_enable_stats_header);
LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction);
LUSTRE_STATIC_UINT_ATTR(ping_interval, &ping_interval);
&lustre_sattr_at_extra.u.attr,
&lustre_sattr_at_early_margin.u.attr,
&lustre_sattr_at_history.u.attr,
+ &lustre_sattr_at_unhealthy_factor.u.attr,
&lustre_attr_memused_max.attr,
&lustre_attr_memused.attr,
#ifdef HAVE_SERVER_SUPPORT
static inline struct ptlrpc_request *
ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp)
{
- return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false);
+ return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, true);
}
void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req);
module_param(at_history, int, 0644);
MODULE_PARM_DESC(at_history,
"Adaptive timeouts remember the slowest event that took place within this period (sec)");
+module_param(at_unhealthy_factor, int, 0644);
+MODULE_PARM_DESC(at_unhealthy_factor,
+ "Multiple of at_max when delayed RPCs considered unhealthy");
module_param(at_early_margin, int, 0644);
MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply");
module_param(at_extra, int, 0644);
RETURN(NULL);
got_request:
+ svcpt->scp_last_request = ktime_get_real_seconds();
svcpt->scp_nreqs_active++;
if (req->rq_hp)
svcpt->scp_nhreqs_active++;
/**
* Returns 0 if the service is healthy.
*
- * Right now, it just checks to make sure that requests aren't languishing
- * in the queue. We'll use this health check to govern whether a node needs
- * to be shot, so it's intentionally non-aggressive.
+ * Check whether requests have been waiting in the queue for an excessive
+ * time without being processed. Individual requests may wait in the queue
+ * for some time due to NRS policies, overloaded storage, etc. but the queue
+ * itself should continue to process some requests on a regular basis.
+ *
+ * We'll use this health check to govern whether a node needs to be shot,
+ * so it's intentionally non-aggressive.
*/
static int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt)
{
struct ptlrpc_request *request = NULL;
- struct timespec64 right_now;
- struct timespec64 timediff;
struct obd_device *obd = NULL;
-
- ktime_get_real_ts64(&right_now);
+ time64_t right_now;
+ time64_t req_waited;
+ time64_t svc_waited;
+ bool recovering;
+ unsigned int max;
+
+ /* quick check without locking to handle the most common case */
+ right_now = ktime_get_real_seconds();
+ if (likely(right_now - svcpt->scp_last_request < obd_get_at_max(NULL)))
+ return 0;
spin_lock(&svcpt->scp_req_lock);
- /* How long has the next entry been waiting? */
if (ptlrpc_server_high_pending(svcpt, true))
request = ptlrpc_nrs_req_peek_nolock(svcpt, true);
else if (ptlrpc_server_normal_pending(svcpt, true))
request = ptlrpc_nrs_req_peek_nolock(svcpt, false);
+ /* if no waiting requests, service idle time is irrelevant */
if (request == NULL) {
spin_unlock(&svcpt->scp_req_lock);
return 0;
}
- timediff = timespec64_sub(right_now, request->rq_arrival_time);
- spin_unlock(&svcpt->scp_req_lock);
-
if (request->rq_export)
obd = request->rq_export->exp_obd;
- if ((timediff.tv_sec) >
- (obd_at_off(obd) ? obd_timeout * 3 / 2 : obd_get_at_max(obd))) {
- CERROR("%s: unhealthy - request has been waiting %llds\n",
- svcpt->scp_service->srv_name, (s64)timediff.tv_sec);
- return -1;
+ req_waited = right_now - request->rq_arrival_time.tv_sec;
+ svc_waited = right_now - svcpt->scp_last_request;
+ recovering = obd ? obd->obd_recovering : false;
+ spin_unlock(&svcpt->scp_req_lock);
+
+ max = obd_get_at_max(obd);
+ if (min(req_waited, svc_waited) > max && !recovering) {
+ bool unhealthy = false;
+
+ /* if at_unhealthy_factor = 0 then disable unhealthy status but
+ * at least print a warning if requests are stuck for a while
+ */
+ if (obd_get_at_unhealthy_factor(obd) &&
+ svc_waited > max * obd_get_at_unhealthy_factor(obd)) {
+ /* check if other CPTs in svc also unhealthy? */
+ unhealthy = true;
+ }
+
+ CDEBUG_LIMIT(unhealthy ? D_ERROR : D_WARNING,
+ "%s: %s - request waiting %llus, service %llus\n",
+ obd ? obd->obd_name : svcpt->scp_service->srv_name,
+ unhealthy ? "unhealthy" : "notice",
+ req_waited, svc_waited);
+ if (unhealthy)
+ return -1;
}
return 0;
}
run_test 116 "DNE: Set default LMV layout from a remote client"
+test_200() {
+ remote_ost_nodsh && skip "remote OST with nodsh" && return
+
+ local at_max=10
+ local delay=20
+ local old_at=$(do_facet ost1 "$LCTL get_param -n at_max")
+ local old_uf=$(do_facet ost1 "$LCTL get_param -n at_unhealthy_factor")
+ local old_thrds=$(do_facet ost1 \
+ "$LCTL get_param -n ost.OSS.ost_io.threads_max")
+ local thrds=$(do_facet ost1 \
+ "$LCTL get_param -n ost.OSS.ost_io.threads_started")
+
+ if [[ -z "$old_uf" ]]; then
+ skip "no at_unhealthy_factor, skipping"
+ fi
+
+ stack_trap "wait_osc_import_ready client ost1"
+ do_facet ost1 "$LCTL set_param at_max=$at_max at_unhealthy_factor=3 \
+ ost.OSS.ost_io.threads_max=$thrds"
+ stack_trap "do_facet ost1 $LCTL set_param at_max=$old_at \
+ at_unhealthy_factor=$old_uf \
+ ost.OSS.ost_io.threads_max=$old_thrds"
+
+ test_mkdir $DIR/$tdir
+ $LFS setstripe -i 0 -c 1 $DIR/$tdir
+ # delay for long enough to exceed old UNHEALTHY check, but allow
+ # threads to complete RPCs occasionally to keep service healthy
+ #define OBD_FAIL_OFD_COMMITRW_DELAY 0x1e1
+ do_facet ost1 "$LCTL set_param fail_loc=0x1e1 fail_val=$delay"
+
+ for ((i=0; i < thrds * 2; i++)); do
+ dd if=/dev/urandom of=$DIR/$tdir/$tfile.$i bs=4k count=1 \
+ oflag=direct &
+ done
+ sync &
+ sleep 5
+ # enough time for 2 sets of writes to be blocked for $fail_val
+ local end=$((SECONDS + delay))
+
+ sleep $at_max
+ while ((SECONDS < end)); do
+ local health=$(do_facet ost1 $LCTL get_param -n health_check)
+
+ if [[ "$health" != "healthy" ]]; then
+ echo "OST health=$health"
+ break
+ fi
+ sleep 3
+ done
+ (( SECONDS >= end )) || error "unhealthy state was seen"
+ wait
+}
+run_test 200 "service remains healthy while able to process request"
+
log "cleanup: ======================================================"
# kill and wait in each test only guarentee script finish, but command in script
if [[ $facet == client* ]]; then
# During setup time, the osc might not be setup, it need wait
# until list_param can return valid value.
- params=$($LCTL list_param $param 2>/dev/null || true)
+ params=$($LCTL list_param $param 2>/dev/null | head -1)
while [ -z "$params" ]; do
if [ $i -ge $maxtime ]; then
echo "can't get $param in $maxtime secs"
fi
sleep 1
i=$((i + 1))
- params=$($LCTL list_param $param 2>/dev/null || true)
+ params=$($LCTL list_param $param 2>/dev/null | head -1)
done
fi