Whamcloud - gitweb
LU-17242 debug: use dump_stack() where possible 25/53625/2
authorTimothy Day <timday@amazon.com>
Tue, 9 Jan 2024 17:17:10 +0000 (17:17 +0000)
committerOleg Drokin <green@whamcloud.com>
Sun, 4 Feb 2024 08:31:25 +0000 (08:31 +0000)
In some cases, libcfs_debug_dumpstack() can fail to output a
stack trace - either because the needed symbols are not exported
or those symbols can't be resolved at runtime. This seems to
occur more often with newer kernels. The messages appears only
as:

 Lustre: ldlm_cb01_002: service thread pid 57876 was inactive for
   40.494 seconds. The thread might be hung, or it might only be
   slow and will resume later. Dumping the stack trace for
   debugging purposes:
 Pid: 57876, comm: ldlm_cb01_002 6.1.70 #1 SMP PREEMPT_DYNAMIC
   Thu Jan  4 18:52:41 UTC 2024
 Call Trace TBD:

with no stack trace (seen on CentOS 8.5 with ml 6.1.70).

For reference, the runtime symbol lookup was added and updated in:

 b49ce7a ("LU-12400 libcfs: save_stack_trace_tsk if ARCH_STACKWALK")
 58ac9d3 ("LU-14099 build: Fix for unconfigured arch_stackwalk")

First, add a message when the symbol can't be resolved correctly.
This makes it much easier to understand why the stack trace is
missing.

Second, replace libcfs_debug_dumpstack(NULL) with dump_stack().
When the task_struct is NULL, libcfs uses the current
task_struct. This replicates the functionality of dump_stack().
Using dump_stack() is more reliable, more in line with kernel
style, and not likely to be un-exported in the future.

Finally, in lustre/osc/osc_object.c the stack isn't dumped since
there is already an LBUG().

There only remains one user of libcfs_debug_dumpstack() which
uses a task_struct other than current. This can be cleaned up
in a future patch.

Test-Parameters: trivial
Signed-off-by: Timothy Day <timday@amazon.com>
Change-Id: I196c1da7e39b1a694c0cb67ecfaab58ab3e4662c
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53625
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Patrick Farrell <pfarrell@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
libcfs/libcfs/debug.c
libcfs/libcfs/linux/linux-prim.c
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/mdc/mdc_dev.c
lustre/osc/osc_object.c
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_internal.h

index 93bef6b..cb67d26 100644 (file)
@@ -461,7 +461,7 @@ void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
                /* not reached */
        }
 
-       libcfs_debug_dumpstack(NULL);
+       dump_stack();
        if (libcfs_panic_on_lbug)
                panic("LBUG");
        else
index 660d328..3ad1694 100644 (file)
@@ -137,6 +137,9 @@ int cfs_stack_trace_save_tsk(struct task_struct *task, unsigned long *store,
 {
        if (task_dump_stack_t)
                return task_dump_stack_t(task, store, size, skipnr);
+
+       pr_info("No stack, save_stack_trace_tsk() could not be found\n");
+
        return 0;
 }
 #endif
index 1d62ddc..7c0a486 100644 (file)
@@ -1660,7 +1660,7 @@ int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
                break;
        default:
                LDLM_ERROR(lock, "Unknown LVB type: %d", lock->l_lvb_type);
-               libcfs_debug_dumpstack(NULL);
+               dump_stack();
                RETURN(-EINVAL);
        }
 
index 8651412..fabb82d 100644 (file)
@@ -469,7 +469,7 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t timeout)
                LDLM_ERROR(lock, "not waiting on destroyed lock (b=5653)");
                if (ktime_get_seconds() > next) {
                        next = ktime_get_seconds() + 14400;
-                       libcfs_debug_dumpstack(NULL);
+                       dump_stack();
                }
                return 0;
        }
index 8374ca5..b4ad623 100644 (file)
@@ -1041,7 +1041,7 @@ static int mdc_get_lock_handle(const struct lu_env *env, struct osc_object *osc,
                                PFID(lu_object_fid(osc2lu(osc))));
                else
                        ldlm_resource_dump(D_ERROR, res);
-               libcfs_debug_dumpstack(NULL);
+               dump_stack();
                return -ENOENT;
        } else {
                *lh = lock->l_remote_handle;
index 026cb2d..d9b2290 100644 (file)
@@ -409,7 +409,6 @@ static void osc_req_attr_set(const struct lu_env *env, struct cl_object *obj,
                        else
                                ldlm_resource_dump(D_ERROR, res);
 
-                       libcfs_debug_dumpstack(NULL);
                        LBUG();
                }
 
index 46f20ba..a43a2da 100644 (file)
@@ -1769,7 +1769,7 @@ static void __osd_th_check_slow(void *oth, struct osd_device *dev,
        if (ktime_before(ktime_add_ns(alloced, 30 * NSEC_PER_SEC), now)) {
                CWARN("transaction handle %p was open for too long: now %lld, alloced %lld, started %lld, closed %lld\n",
                                oth, now, alloced, started, closed);
-               libcfs_debug_dumpstack(NULL);
+               dump_stack();
        }
 }
 
@@ -2049,7 +2049,7 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d,
                        CWARN("%s: credits %u > trans_max %u\n", osd_name(dev),
                              oh->ot_credits, osd_transaction_size(dev));
                        osd_trans_dump_creds(env, th);
-                       libcfs_debug_dumpstack(NULL);
+                       dump_stack();
                        last_credits = oh->ot_credits;
                        last_printed = jiffies;
                }
index a762d94..bc3ceaa 100644 (file)
@@ -1287,7 +1287,7 @@ static inline void osd_trans_declare_op(const struct lu_env *env,
                } else {
                        CWARN("%s: Invalid operation index %d\n",
                              osd_name(osd_dt_dev(oh->ot_super.th_dev)), op);
-                       libcfs_debug_dumpstack(NULL);
+                       dump_stack();
                }
        } else {
                oti->oti_declare_ops[op]++;
@@ -1318,7 +1318,7 @@ static inline void osd_trans_exec_op(const struct lu_env *env,
                        CWARN("%s: opcode %u: invalid value >= %u\n",
                              osd_name(osd_dt_dev(oh->ot_super.th_dev)),
                              op, OSD_OT_MAX);
-                       libcfs_debug_dumpstack(NULL);
+                       dump_stack();
                        return;
                }
        }