From ecac0c175d934fd5624c9ad8db8f45dbc33fb56c Mon Sep 17 00:00:00 2001 From: Timothy Day Date: Tue, 9 Jan 2024 17:17:10 +0000 Subject: [PATCH] LU-17242 debug: use dump_stack() where possible In some cases, libcfs_debug_dumpstack() can fail to output a stack trace - either because the needed symbols are not exported or those symbols can't be resolved at runtime. This seems to occur more often with newer kernels. The messages appears only as: Lustre: ldlm_cb01_002: service thread pid 57876 was inactive for 40.494 seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: Pid: 57876, comm: ldlm_cb01_002 6.1.70 #1 SMP PREEMPT_DYNAMIC Thu Jan 4 18:52:41 UTC 2024 Call Trace TBD: with no stack trace (seen on CentOS 8.5 with ml 6.1.70). For reference, the runtime symbol lookup was added and updated in: b49ce7a ("LU-12400 libcfs: save_stack_trace_tsk if ARCH_STACKWALK") 58ac9d3 ("LU-14099 build: Fix for unconfigured arch_stackwalk") First, add a message when the symbol can't be resolved correctly. This makes it much easier to understand why the stack trace is missing. Second, replace libcfs_debug_dumpstack(NULL) with dump_stack(). When the task_struct is NULL, libcfs uses the current task_struct. This replicates the functionality of dump_stack(). Using dump_stack() is more reliable, more in line with kernel style, and not likely to be un-exported in the future. Finally, in lustre/osc/osc_object.c the stack isn't dumped since there is already an LBUG(). There only remains one user of libcfs_debug_dumpstack() which uses a task_struct other than current. This can be cleaned up in a future patch. Test-Parameters: trivial Signed-off-by: Timothy Day Change-Id: I196c1da7e39b1a694c0cb67ecfaab58ab3e4662c Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53625 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: James Simmons Reviewed-by: Patrick Farrell Reviewed-by: Oleg Drokin --- libcfs/libcfs/debug.c | 2 +- libcfs/libcfs/linux/linux-prim.c | 3 +++ lustre/ldlm/ldlm_lock.c | 2 +- lustre/ldlm/ldlm_lockd.c | 2 +- lustre/mdc/mdc_dev.c | 2 +- lustre/osc/osc_object.c | 1 - lustre/osd-ldiskfs/osd_handler.c | 4 ++-- lustre/osd-ldiskfs/osd_internal.h | 4 ++-- 8 files changed, 11 insertions(+), 9 deletions(-) diff --git a/libcfs/libcfs/debug.c b/libcfs/libcfs/debug.c index 93bef6b..cb67d26 100644 --- a/libcfs/libcfs/debug.c +++ b/libcfs/libcfs/debug.c @@ -461,7 +461,7 @@ void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata) /* not reached */ } - libcfs_debug_dumpstack(NULL); + dump_stack(); if (libcfs_panic_on_lbug) panic("LBUG"); else diff --git a/libcfs/libcfs/linux/linux-prim.c b/libcfs/libcfs/linux/linux-prim.c index 660d328..3ad1694 100644 --- a/libcfs/libcfs/linux/linux-prim.c +++ b/libcfs/libcfs/linux/linux-prim.c @@ -137,6 +137,9 @@ int cfs_stack_trace_save_tsk(struct task_struct *task, unsigned long *store, { if (task_dump_stack_t) return task_dump_stack_t(task, store, size, skipnr); + + pr_info("No stack, save_stack_trace_tsk() could not be found\n"); + return 0; } #endif diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 1d62ddc..7c0a486 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -1660,7 +1660,7 @@ int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, break; default: LDLM_ERROR(lock, "Unknown LVB type: %d", lock->l_lvb_type); - libcfs_debug_dumpstack(NULL); + dump_stack(); RETURN(-EINVAL); } diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 8651412..fabb82d2 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -469,7 +469,7 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t timeout) LDLM_ERROR(lock, "not waiting on destroyed lock (b=5653)"); if (ktime_get_seconds() > next) { next = ktime_get_seconds() + 14400; - libcfs_debug_dumpstack(NULL); + dump_stack(); } return 0; } diff --git a/lustre/mdc/mdc_dev.c b/lustre/mdc/mdc_dev.c index 8374ca5..b4ad623 100644 --- a/lustre/mdc/mdc_dev.c +++ b/lustre/mdc/mdc_dev.c @@ -1041,7 +1041,7 @@ static int mdc_get_lock_handle(const struct lu_env *env, struct osc_object *osc, PFID(lu_object_fid(osc2lu(osc)))); else ldlm_resource_dump(D_ERROR, res); - libcfs_debug_dumpstack(NULL); + dump_stack(); return -ENOENT; } else { *lh = lock->l_remote_handle; diff --git a/lustre/osc/osc_object.c b/lustre/osc/osc_object.c index 026cb2d..d9b2290 100644 --- a/lustre/osc/osc_object.c +++ b/lustre/osc/osc_object.c @@ -409,7 +409,6 @@ static void osc_req_attr_set(const struct lu_env *env, struct cl_object *obj, else ldlm_resource_dump(D_ERROR, res); - libcfs_debug_dumpstack(NULL); LBUG(); } diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 46f20ba..a43a2da 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -1769,7 +1769,7 @@ static void __osd_th_check_slow(void *oth, struct osd_device *dev, if (ktime_before(ktime_add_ns(alloced, 30 * NSEC_PER_SEC), now)) { CWARN("transaction handle %p was open for too long: now %lld, alloced %lld, started %lld, closed %lld\n", oth, now, alloced, started, closed); - libcfs_debug_dumpstack(NULL); + dump_stack(); } } @@ -2049,7 +2049,7 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d, CWARN("%s: credits %u > trans_max %u\n", osd_name(dev), oh->ot_credits, osd_transaction_size(dev)); osd_trans_dump_creds(env, th); - libcfs_debug_dumpstack(NULL); + dump_stack(); last_credits = oh->ot_credits; last_printed = jiffies; } diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index a762d94..bc3ceaa 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -1287,7 +1287,7 @@ static inline void osd_trans_declare_op(const struct lu_env *env, } else { CWARN("%s: Invalid operation index %d\n", osd_name(osd_dt_dev(oh->ot_super.th_dev)), op); - libcfs_debug_dumpstack(NULL); + dump_stack(); } } else { oti->oti_declare_ops[op]++; @@ -1318,7 +1318,7 @@ static inline void osd_trans_exec_op(const struct lu_env *env, CWARN("%s: opcode %u: invalid value >= %u\n", osd_name(osd_dt_dev(oh->ot_super.th_dev)), op, OSD_OT_MAX); - libcfs_debug_dumpstack(NULL); + dump_stack(); return; } } -- 1.8.3.1