From 70b9dc57a99d5ce1769a41d32c29cd3fe8fb074a Mon Sep 17 00:00:00 2001 From: Rajeev Mishra Date: Tue, 26 Mar 2024 02:15:31 +0000 Subject: [PATCH] LU-17812 ldlm: stack trace log for LDLM error Added support to dump the stack trace in ldlm_lock_debug(). The stack trace is logged only for the case of D_ERROR and when dump_stack_on_error is enabled Test-Parameters: testlist=sanity env=ONLY=105g HPE-bug-id: LUS-12165 Signed-off-by: Rajeev Mishra Change-Id: I4ce280334e0273df1751257e8db03ea680831696 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54896 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin Reviewed-by: Petros Koutoupis Reviewed-by: Andreas Dilger --- lustre/include/lustre_dlm.h | 14 +++++++-- lustre/include/obd_support.h | 1 + lustre/ldlm/ldlm_flock.c | 2 ++ lustre/ldlm/ldlm_resource.c | 72 ++++++++++++++++++++++++++++++++------------ lustre/tests/sanity.sh | 19 ++++++++++++ 5 files changed, 87 insertions(+), 21 deletions(-) diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 2f40d6c..98c80d6 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -571,6 +571,12 @@ struct ldlm_namespace { */ unsigned int ns_stopping:1, + + /** + * This namespace will control the stack trace log in ldlm_lock_debug + */ + ns_dump_stack_on_error:1, + /** * Flag to indicate the LRU recalc on RPC reply is in progress. * Used to limit the process by 1 thread only. @@ -1317,10 +1323,14 @@ extern const char *ldlm_it2str(enum ldlm_intent_flags it); */ #ifdef LIBCFS_DEBUG #define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) 
do { \ - if (((mask) & D_CANTMASK) != 0 || \ + if (((mask) & D_CANTMASK) != 0 || \ ((libcfs_debug & (mask)) != 0 && \ - (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) { \ _ldlm_lock_debug(lock, msgdata, fmt, ##a); \ + if (ldlm_lock_to_ns(lock)->ns_dump_stack_on_error && \ + (mask) & D_ERROR) \ + dump_stack(); \ + } \ } while (0) __printf(3, 4) /* function attribute */ diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 74a2245..0f37442 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -414,6 +414,7 @@ extern bool obd_enable_health_write; #define OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE 0x32c #define OBD_FAIL_LDLM_LOCK_REPLAY 0x32d #define OBD_FAIL_LDLM_REPLAY_PAUSE 0x32e +#define OBD_FAIL_LDLM_LOCK_STACK 0x32f /* LOCKLESS IO */ #define OBD_FAIL_LDLM_SET_CONTENTION 0x385 diff --git a/lustre/ldlm/ldlm_flock.c b/lustre/ldlm/ldlm_flock.c index 07056bf..5474971 100644 --- a/lustre/ldlm/ldlm_flock.c +++ b/lustre/ldlm/ldlm_flock.c @@ -671,6 +671,8 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) ENTRY; + if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_LOCK_STACK)) + LDLM_ERROR(lock, "Test ldlm error stack"); CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT2, 4); if (CFS_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT3)) { lock_res_and_lock(lock); diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 0f11d25..5131797 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -532,6 +532,38 @@ static ssize_t dirty_age_limit_store(struct kobject *kobj, } LUSTRE_RW_ATTR(dirty_age_limit); +static ssize_t dump_stack_on_error_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + ns->ns_dump_stack_on_error); +} + +static ssize_t dump_stack_on_error_store(struct kobject *kobj, + struct 
attribute *attr, + const char *buffer, size_t count) +{ + struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, + ns_kobj); + bool tmp; + int err; + + err = kstrtobool(buffer, &tmp); + if (err != 0) + return -EINVAL; + + if (tmp != 0) + ns->ns_dump_stack_on_error = 1; + else + ns->ns_dump_stack_on_error = 0; + + return count; +} +LUSTRE_RW_ATTR(dump_stack_on_error); + #ifdef HAVE_SERVER_SUPPORT static ssize_t ctime_age_limit_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -692,6 +724,7 @@ static struct attribute *ldlm_ns_attrs[] = { &lustre_attr_lru_max_age.attr, &lustre_attr_early_lock_cancel.attr, &lustre_attr_dirty_age_limit.attr, + &lustre_attr_dump_stack_on_error.attr, #ifdef HAVE_SERVER_SUPPORT &lustre_attr_ctime_age_limit.attr, &lustre_attr_lock_timeouts.attr, @@ -970,25 +1003,26 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, atomic_set(&ns->ns_bref, 0); init_waitqueue_head(&ns->ns_waitq); - ns->ns_max_nolock_size = NS_DEFAULT_MAX_NOLOCK_BYTES; - ns->ns_contention_time = NS_DEFAULT_CONTENTION_SECONDS; - ns->ns_contended_locks = NS_DEFAULT_CONTENDED_LOCKS; - - ns->ns_max_parallel_ast = LDLM_DEFAULT_PARALLEL_AST_LIMIT; - ns->ns_nr_unused = 0; - ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; - ns->ns_cancel_batch = LDLM_DEFAULT_LRU_SHRINK_BATCH; - ns->ns_recalc_pct = LDLM_DEFAULT_SLV_RECALC_PCT; - ns->ns_max_age = ktime_set(LDLM_DEFAULT_MAX_ALIVE, 0); - ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; - ns->ns_dirty_age_limit = ktime_set(LDLM_DIRTY_AGE_LIMIT, 0); - ns->ns_timeouts = 0; - ns->ns_orig_connect_flags = 0; - ns->ns_connect_flags = 0; - ns->ns_stopping = 0; - ns->ns_reclaim_start = 0; - ns->ns_last_pos = &ns->ns_unused_list; - ns->ns_flags = 0; + ns->ns_max_nolock_size = NS_DEFAULT_MAX_NOLOCK_BYTES; + ns->ns_contention_time = NS_DEFAULT_CONTENTION_SECONDS; + ns->ns_contended_locks = NS_DEFAULT_CONTENDED_LOCKS; + + ns->ns_max_parallel_ast = LDLM_DEFAULT_PARALLEL_AST_LIMIT; + ns->ns_nr_unused 
= 0; + ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; + ns->ns_cancel_batch = LDLM_DEFAULT_LRU_SHRINK_BATCH; + ns->ns_recalc_pct = LDLM_DEFAULT_SLV_RECALC_PCT; + ns->ns_max_age = ktime_set(LDLM_DEFAULT_MAX_ALIVE, 0); + ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; + ns->ns_dirty_age_limit = ktime_set(LDLM_DIRTY_AGE_LIMIT, 0); + ns->ns_timeouts = 0; + ns->ns_orig_connect_flags = 0; + ns->ns_connect_flags = 0; + ns->ns_stopping = 0; + ns->ns_dump_stack_on_error = 0; + ns->ns_reclaim_start = 0; + ns->ns_last_pos = &ns->ns_unused_list; + ns->ns_flags = 0; rc = ldlm_namespace_sysfs_register(ns); if (rc) { diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index ff59e9d..d6276c6 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -13300,6 +13300,25 @@ test_105f() { } run_test 105f "Enqueue same range flocks" +test_105g() { + + (( $CLIENT_VERSION >= $(version_code 2.15.63.127) )) || + skip "Need Client >= 2.15.63.127 for ldlm dump stack" + + flock_is_enabled || skip_env "mount w/o flock enabled" + mkdir $DIR/$tdir + #define OBD_FAIL_LDLM_LOCK_STACK 0x32f + do_facet client $LCTL set_param -n \ + ldlm.namespaces.*.dump_stack_on_error=1 + stack_trap "do_facet client $LCTL set_param -n \ + ldlm.namespaces.*.dump_stack_on_error=0" + $LCTL set_param fail_loc=0x8000032f + flocks_test 2 $DIR/$tdir + dmesg | tac | sed "/$(echo $TESTNAME | tr '_' ' ')/,$ d" | + grep -q "dump_stack" || error "didn't find dump_stack" +} +run_test 105g "ldlm_lock_debug stack test" + test_106() { #bug 10921 test_mkdir $DIR/$tdir $DIR/$tdir && error "exec $DIR/$tdir succeeded" -- 1.8.3.1