Whamcloud - gitweb
LU-17812 ldlm: stack trace log for LDLM error 96/54896/18
author: Rajeev Mishra <rajeevm@hpe.com>
Tue, 26 Mar 2024 02:15:31 +0000 (02:15 +0000)
committer: Oleg Drokin <green@whamcloud.com>
Tue, 23 Jul 2024 04:41:10 +0000 (04:41 +0000)
Add support for dumping the stack trace in
ldlm_lock_debug(). The stack trace is logged only
for D_ERROR messages, and only when dump_stack_on_error
is enabled for the namespace.

Test-Parameters: testlist=sanity env=ONLY=105g
HPE-bug-id: LUS-12165
Signed-off-by: Rajeev Mishra <rajeevm@hpe.com>
Change-Id: I4ce280334e0273df1751257e8db03ea680831696
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54896
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Petros Koutoupis <petros.koutoupis@hpe.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/include/lustre_dlm.h
lustre/include/obd_support.h
lustre/ldlm/ldlm_flock.c
lustre/ldlm/ldlm_resource.c
lustre/tests/sanity.sh

index 2f40d6c..98c80d6 100644 (file)
@@ -571,6 +571,12 @@ struct ldlm_namespace {
         */
        unsigned int            ns_stopping:1,
 
+
+       /**
+        * Set to make ldlm_lock_debug() dump a stack trace when a D_ERROR
+        * message is logged for a lock in this namespace.
+        */
+                               ns_dump_stack_on_error:1,
+
        /**
         * Flag to indicate the LRU recalc on RPC reply is in progress.
         * Used to limit the process by 1 thread only.
@@ -1317,10 +1323,14 @@ extern const char *ldlm_it2str(enum ldlm_intent_flags it);
  */
 #ifdef LIBCFS_DEBUG
 #define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do {      \
-       if (((mask) & D_CANTMASK) != 0 ||                               \
+       if (((mask) & D_CANTMASK) != 0 ||                \
            ((libcfs_debug & (mask)) != 0 &&                            \
-            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))          \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) {        \
                _ldlm_lock_debug(lock, msgdata, fmt, ##a);              \
+               if (ldlm_lock_to_ns(lock)->ns_dump_stack_on_error &&    \
+                                       (mask) & D_ERROR)             \
+                       dump_stack();                                   \
+       }                                                               \
 } while (0)
 
 __printf(3, 4) /* function attribute */
index 74a2245..0f37442 100644 (file)
@@ -414,6 +414,7 @@ extern bool obd_enable_health_write;
 #define OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE 0x32c
 #define OBD_FAIL_LDLM_LOCK_REPLAY       0x32d
 #define OBD_FAIL_LDLM_REPLAY_PAUSE      0x32e
+#define OBD_FAIL_LDLM_LOCK_STACK         0x32f
 
 /* LOCKLESS IO */
 #define OBD_FAIL_LDLM_SET_CONTENTION     0x385
index 07056bf..5474971 100644 (file)
@@ -671,6 +671,8 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
 
        ENTRY;
 
+       if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_LOCK_STACK))
+               LDLM_ERROR(lock, "Test ldlm error stack");
        CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT2, 4);
        if (CFS_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT3)) {
                lock_res_and_lock(lock);
index 0f11d25..5131797 100644 (file)
@@ -532,6 +532,38 @@ static ssize_t dirty_age_limit_store(struct kobject *kobj,
 }
 LUSTRE_RW_ATTR(dirty_age_limit);
 
+/**
+ * Report whether ldlm_lock_debug() stack dumps are enabled for a namespace.
+ *
+ * \param[in]  kobj    kobject embedded in the ldlm_namespace (ns_kobj)
+ * \param[in]  attr    sysfs attribute being read (unused)
+ * \param[out] buf     sysfs output buffer, receives "0\n" or "1\n"
+ *
+ * \retval             number of bytes written to \a buf
+ */
+static ssize_t dump_stack_on_error_show(struct kobject *kobj,
+                                    struct attribute *attr, char *buf)
+{
+       struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace,
+                                                ns_kobj);
+
+       /* buf is a pointer, so sizeof(buf) is the pointer size and not the
+        * buffer size; sysfs passes show() a PAGE_SIZE buffer.
+        */
+       return scnprintf(buf, PAGE_SIZE, "%u\n", ns->ns_dump_stack_on_error);
+}
+
+/**
+ * Enable or disable ldlm_lock_debug() stack dumps for a namespace.
+ *
+ * \param[in] kobj     kobject embedded in the ldlm_namespace (ns_kobj)
+ * \param[in] attr     sysfs attribute being written (unused)
+ * \param[in] buffer   user input, parsed as a boolean by kstrtobool()
+ * \param[in] count    length of \a buffer
+ *
+ * \retval             \a count on success
+ * \retval             -EINVAL if \a buffer is not a valid boolean
+ */
+static ssize_t dump_stack_on_error_store(struct kobject *kobj,
+                                     struct attribute *attr,
+                                     const char *buffer, size_t count)
+{
+       struct ldlm_namespace *ns;
+       bool enable;
+
+       /* reject unparsable input before touching the namespace */
+       if (kstrtobool(buffer, &enable) != 0)
+               return -EINVAL;
+
+       ns = container_of(kobj, struct ldlm_namespace, ns_kobj);
+       ns->ns_dump_stack_on_error = enable ? 1 : 0;
+
+       return count;
+}
+LUSTRE_RW_ATTR(dump_stack_on_error);
+
 #ifdef HAVE_SERVER_SUPPORT
 static ssize_t ctime_age_limit_show(struct kobject *kobj,
                                    struct attribute *attr, char *buf)
@@ -692,6 +724,7 @@ static struct attribute *ldlm_ns_attrs[] = {
        &lustre_attr_lru_max_age.attr,
        &lustre_attr_early_lock_cancel.attr,
        &lustre_attr_dirty_age_limit.attr,
+       &lustre_attr_dump_stack_on_error.attr,
 #ifdef HAVE_SERVER_SUPPORT
        &lustre_attr_ctime_age_limit.attr,
        &lustre_attr_lock_timeouts.attr,
@@ -970,25 +1003,26 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name,
        atomic_set(&ns->ns_bref, 0);
        init_waitqueue_head(&ns->ns_waitq);
 
-       ns->ns_max_nolock_size    = NS_DEFAULT_MAX_NOLOCK_BYTES;
-       ns->ns_contention_time    = NS_DEFAULT_CONTENTION_SECONDS;
-       ns->ns_contended_locks    = NS_DEFAULT_CONTENDED_LOCKS;
-
-       ns->ns_max_parallel_ast   = LDLM_DEFAULT_PARALLEL_AST_LIMIT;
-       ns->ns_nr_unused          = 0;
-       ns->ns_max_unused         = LDLM_DEFAULT_LRU_SIZE;
-       ns->ns_cancel_batch       = LDLM_DEFAULT_LRU_SHRINK_BATCH;
-       ns->ns_recalc_pct         = LDLM_DEFAULT_SLV_RECALC_PCT;
-       ns->ns_max_age            = ktime_set(LDLM_DEFAULT_MAX_ALIVE, 0);
-       ns->ns_ctime_age_limit    = LDLM_CTIME_AGE_LIMIT;
-       ns->ns_dirty_age_limit    = ktime_set(LDLM_DIRTY_AGE_LIMIT, 0);
-       ns->ns_timeouts           = 0;
-       ns->ns_orig_connect_flags = 0;
-       ns->ns_connect_flags      = 0;
-       ns->ns_stopping           = 0;
-       ns->ns_reclaim_start      = 0;
-       ns->ns_last_pos           = &ns->ns_unused_list;
-       ns->ns_flags              = 0;
+       ns->ns_max_nolock_size          = NS_DEFAULT_MAX_NOLOCK_BYTES;
+       ns->ns_contention_time          = NS_DEFAULT_CONTENTION_SECONDS;
+       ns->ns_contended_locks          = NS_DEFAULT_CONTENDED_LOCKS;
+
+       ns->ns_max_parallel_ast         = LDLM_DEFAULT_PARALLEL_AST_LIMIT;
+       ns->ns_nr_unused                = 0;
+       ns->ns_max_unused               = LDLM_DEFAULT_LRU_SIZE;
+       ns->ns_cancel_batch             = LDLM_DEFAULT_LRU_SHRINK_BATCH;
+       ns->ns_recalc_pct               = LDLM_DEFAULT_SLV_RECALC_PCT;
+       ns->ns_max_age                  = ktime_set(LDLM_DEFAULT_MAX_ALIVE, 0);
+       ns->ns_ctime_age_limit          = LDLM_CTIME_AGE_LIMIT;
+       ns->ns_dirty_age_limit          = ktime_set(LDLM_DIRTY_AGE_LIMIT, 0);
+       ns->ns_timeouts                 = 0;
+       ns->ns_orig_connect_flags       = 0;
+       ns->ns_connect_flags            = 0;
+       ns->ns_stopping                 = 0;
+       ns->ns_dump_stack_on_error      = 0;
+       ns->ns_reclaim_start            = 0;
+       ns->ns_last_pos                 = &ns->ns_unused_list;
+       ns->ns_flags                    = 0;
 
        rc = ldlm_namespace_sysfs_register(ns);
        if (rc) {
index ff59e9d..d6276c6 100755 (executable)
@@ -13300,6 +13300,25 @@ test_105f() {
 }
 run_test 105f "Enqueue same range flocks"
 
+test_105g() {
+
+       (( $CLIENT_VERSION >= $(version_code 2.15.63.127) )) ||
+               skip "Need Client >= 2.15.63.127 for ldlm dump stack"
+
+       flock_is_enabled || skip_env "mount w/o flock enabled"
+       mkdir $DIR/$tdir
+       #define OBD_FAIL_LDLM_LOCK_STACK  0x32f
+       do_facet client $LCTL set_param -n \
+               ldlm.namespaces.*.dump_stack_on_error=1
+       stack_trap "do_facet client $LCTL set_param -n \
+               ldlm.namespaces.*.dump_stack_on_error=0"
+       $LCTL set_param fail_loc=0x8000032f
+       flocks_test 2 $DIR/$tdir
+       dmesg | tac | sed "/$(echo $TESTNAME | tr '_' ' ')/,$ d" |
+               grep -q "dump_stack" || error "didn't find dump_stack"
+}
+run_test 105g "ldlm_lock_debug stack test"
+
 test_106() { #bug 10921
        test_mkdir $DIR/$tdir
        $DIR/$tdir && error "exec $DIR/$tdir succeeded"