Whamcloud - gitweb
LU-17216 ofd: make enable_health_write tunable 82/52782/14
authorTimothy Day <timday@amazon.com>
Sat, 21 Oct 2023 19:11:42 +0000 (19:11 +0000)
committerOleg Drokin <green@whamcloud.com>
Wed, 29 Nov 2023 21:28:46 +0000 (21:28 +0000)
enable_health_write should be tunable rather than a
compilation option. This allows us to test it more
easily and gives admins the option to try it out
without having to recompile their Lustre servers.
It will still be disabled by default.

Add sanity/70 test to run a simple check to ensure
enable_health_write and health_check don't explode.
It's not a thorough check. But it at least checks
that the interfaces appear to work.

Signed-off-by: Timothy Day <timday@amazon.com>
Change-Id: I7b1832f8acf578b891386e28c5af760070a6862c
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/52782
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
autoMakefile.am
debian/rules
lustre/autoconf/lustre-core.m4
lustre/include/obd_support.h
lustre/obdclass/obd_sysfs.c
lustre/ofd/ofd_obd.c
lustre/tests/sanity.sh

index a7eea36..fdac9b2 100644 (file)
@@ -356,11 +356,6 @@ debs: undef.h debs_common
        elif test "x@ENABLE_FLOCK@" = "xno"; then \
                export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} noflock"; \
        fi; \
-       if test "x@ENABLE_HEALTH_WRITE@" = "xyes"; then \
-               export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} health_write"; \
-       elif test "x@ENABLE_HEALTH_WRITE@" = "xno"; then \
-               export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nohealth_write"; \
-       fi; \
        if test "x@ENABLE_LRU_RESIZE@" = "xyes"; then \
                export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} lru-resize"; \
        elif test "x@ENABLE_LRU_RESIZE@" = "xno"; then \
index 1aabf7e..35e775f 100755 (executable)
@@ -194,7 +194,7 @@ configure-stamp: autogen-stamp debian/control.main debian/control.modules.in
        if echo "$${DEB_BUILD_PROFILES}" | grep -q "o2ib"; then \
                export EXTRAFLAGS="$${EXTRAFLAGS} --with-o2ib=$${O2IB_SRC}"; \
        fi; \
-       options="gss crypto pinger checksum flock health_write lru-resize"; \
+       options="gss crypto pinger checksum flock lru-resize"; \
        options="$${options} mindf fail-alloc invariants lu_ref pgstate-track"; \
        options="$${options} libcfs-cdebug libcfs-trace libcfs-assert"; \
        options="$${options} panic_dumplog readline libpthread"; \
@@ -495,7 +495,7 @@ kdist_config: prep-deb-files patch-stamp
        if echo "$${DEB_BUILD_PROFILES}" | grep -q "o2ib"; then \
                export EXTRAFLAGS="$${EXTRAFLAGS} --with-o2ib=$${O2IB_SRC}"; \
        fi; \
-       options="gss crypto pinger checksum flock health_write lru-resize"; \
+       options="gss crypto pinger checksum flock lru-resize"; \
        options="$${options} mindf fail-alloc invariants lu_ref pgstate-track"; \
        options="$${options} libcfs-cdebug libcfs-trace libcfs-assert"; \
        options="$${options} panic_dumplog readline libpthread"; \
index 664419b..32a423e 100644 (file)
@@ -189,26 +189,6 @@ AS_IF([test "x$enable_flock" != xno], [
 ]) # LC_CONFIG_FLOCK
 
 #
-# LC_CONFIG_HEALTH_CHECK_WRITE
-#
-# Turn off the actual write to the disk
-#
-AC_DEFUN([LC_CONFIG_HEALTH_CHECK_WRITE], [
-AC_MSG_CHECKING([whether to enable a write with the health check])
-AC_ARG_ENABLE([health_write],
-       AS_HELP_STRING([--enable-health_write],
-               [enable disk writes when doing health check]),
-       [], [enable_health_write="no"])
-AC_MSG_RESULT([$enable_health_write])
-AS_IF([test "x$enable_health_write" != xno], [
-       AC_DEFINE(USE_HEALTH_CHECK_WRITE, 1, [Write when Checking Health])
-       AC_SUBST(ENABLE_HEALTH_WRITE, yes)
-], [
-       AC_SUBST(ENABLE_HEALTH_WRITE, no)
-])
-]) # LC_CONFIG_HEALTH_CHECK_WRITE
-
-#
 # LC_CONFIG_LRU_RESIZE
 #
 AC_DEFUN([LC_CONFIG_LRU_RESIZE], [
index d91eea8..bf64d21 100644 (file)
@@ -70,6 +70,7 @@ extern int at_extra;
 extern unsigned long obd_max_dirty_pages;
 extern atomic_long_t obd_dirty_pages;
 extern char obd_jobid_var[];
+extern bool obd_enable_health_write;
 
 /* Some hash init argument constants */
 #define HASH_NID_STATS_BKT_BITS 5
index 02f6642..d0f09aa 100644 (file)
@@ -70,6 +70,9 @@
 #include <uapi/linux/lustre/lustre_ioctl.h>
 #include <uapi/linux/lustre/lustre_ver.h>
 
+bool obd_enable_health_write;
+EXPORT_SYMBOL(obd_enable_health_write);
+
 struct static_lustre_uintvalue_attr {
        struct {
                struct attribute attr;
@@ -269,6 +272,30 @@ health_check_show(struct kobject *kobj, struct attribute *attr, char *buf)
        return len;
 }
 
+#ifdef HAVE_SERVER_SUPPORT
+static ssize_t enable_health_write_show(struct kobject *kobj,
+                                       struct attribute *attr,
+                                       char *buf)
+{
+       return scnprintf(buf, PAGE_SIZE, "%u\n",
+                        obd_enable_health_write);
+}
+
+static ssize_t enable_health_write_store(struct kobject *kobj,
+                                        struct attribute *attr,
+                                        const char *buf, size_t count)
+{
+       int rc = 0;
+
+       rc = kstrtobool(buf, &obd_enable_health_write);
+       if (rc)
+               return rc;
+
+       return count;
+}
+LUSTRE_RW_ATTR(enable_health_write);
+#endif /* HAVE_SERVER_SUPPORT */
+
 static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr,
                              char *buf)
 {
@@ -437,6 +464,7 @@ static struct attribute *lustre_attrs[] = {
        &lustre_attr_memused_max.attr,
        &lustre_attr_memused.attr,
 #ifdef HAVE_SERVER_SUPPORT
+       &lustre_attr_enable_health_write.attr,
        &lustre_sattr_ldlm_timeout.u.attr,
        &lustre_sattr_bulk_timeout.u.attr,
        &lustre_attr_no_transno.attr,
index acf0bcf..e70b375 100644 (file)
@@ -48,6 +48,7 @@
 #include <lustre_quota.h>
 #include <lustre_lfsck.h>
 #include <lustre_nodemap.h>
+#include <obd_support.h>
 
 /**
  * Initialize OFD per-export statistics.
@@ -1347,9 +1348,8 @@ static int ofd_precleanup(struct obd_device *obd)
  * - get statfs from the OSD. It checks just responsiveness of
  *   bottom device
  * - do write attempt on bottom device to check it is fully operational and
- *   is not stuck. This is expensive method and requires special configuration
- *   option --enable-health-write while building Lustre, it is turned off
- *   by default.
+ *   is not stuck. This is expensive method and must be manually enabled
+ *   by setting the enable_health_write sysfs parameter.
  *
  * \param[in] nul      not used
  * \param[in] obd      OBD device of OFD
@@ -1362,9 +1362,8 @@ static int ofd_health_check(const struct lu_env *nul, struct obd_device *obd)
        struct ofd_device *ofd = ofd_dev(obd->obd_lu_dev);
        struct ofd_thread_info *info;
        struct lu_env env;
-#ifdef USE_HEALTH_CHECK_WRITE
        struct thandle *th;
-#endif
+       int rc1 = 0;
        int rc = 0;
 
        /* obd_proc_read_health pass NULL env, we need real one */
@@ -1380,7 +1379,9 @@ static int ofd_health_check(const struct lu_env *nul, struct obd_device *obd)
        if (info->fti_u.osfs.os_state & OS_STATFS_READONLY)
                GOTO(out, rc = -EROFS);
 
-#ifdef USE_HEALTH_CHECK_WRITE
+       if (!obd_enable_health_write)
+               goto out;
+
        OBD_ALLOC(info->fti_buf.lb_buf, PAGE_SIZE);
        if (!info->fti_buf.lb_buf)
                GOTO(out, rc = -ENOMEM);
@@ -1394,21 +1395,28 @@ static int ofd_health_check(const struct lu_env *nul, struct obd_device *obd)
 
        rc = dt_declare_record_write(&env, ofd->ofd_health_check_file,
                                     &info->fti_buf, info->fti_off, th);
-       if (rc == 0) {
-               th->th_sync = 1; /* sync IO is needed */
-               rc = dt_trans_start_local(&env, ofd->ofd_osd, th);
-               if (rc == 0)
-                       rc = dt_record_write(&env, ofd->ofd_health_check_file,
-                                            &info->fti_buf, &info->fti_off,
-                                            th);
-       }
-       dt_trans_stop(&env, ofd->ofd_osd, th);
+       if (rc)
+               goto out_stop;
+
+       th->th_sync = 1; /* sync IO is needed */
+       rc = dt_trans_start_local(&env, ofd->ofd_osd, th);
+
+       if (rc)
+               goto out_stop;
+
+       rc = dt_record_write(&env, ofd->ofd_health_check_file,
+                            &info->fti_buf, &info->fti_off,
+                            th);
+
+out_stop:
+       rc1 = dt_trans_stop(&env, ofd->ofd_osd, th);
+       rc = rc ? rc : rc1;
 
        OBD_FREE(info->fti_buf.lb_buf, PAGE_SIZE);
 
        CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",
               rc);
-#endif
+
 out:
        lu_env_fini(&env);
        return !!rc;
index d7b09bd..60460e3 100755 (executable)
@@ -10424,6 +10424,34 @@ test_69() {
 }
 run_test 69 "verify oa2dentry return -ENOENT doesn't LBUG ======"
 
+test_70a() {
+       # Perform a really simple test of health write
+       # and health check
+
+       local orig_value="$(do_facet ost1 $LCTL get_param -n enable_health_write)"
+
+       stack_trap "do_facet ost1 $LCTL set_param enable_health_write $orig_value"
+
+       # Test with health write off
+       do_facet ost1 $LCTL set_param enable_health_write off ||
+               error "can't set enable_health_write off"
+       do_facet ost1 $LCTL get_param enable_health_write ||
+               error "can't get enable_health_write"
+
+       [[ "$(do_facet ost1 $LCTL get_param health_check)" =~ "healthy" ]] ||
+               error "not healthy (1)"
+
+       # Test with health write on
+       do_facet ost1 $LCTL set_param enable_health_write on ||
+               error "can't set enable_health_write on"
+       do_facet ost1 $LCTL get_param enable_health_write ||
+               error "can't get enable_health_write"
+
+       [[ "$(do_facet ost1 $LCTL get_param health_check)" =~ "healthy" ]] ||
+               error "not healthy (2)"
+}
+run_test 70a "verify health_check, health_write don't explode (on OST)"
+
 test_71() {
        test_mkdir $DIR/$tdir
        $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir