Whamcloud - gitweb
LU-17242 debug: use dump_stack() where possible
author Timothy Day <timday@amazon.com>
Tue, 9 Jan 2024 17:17:10 +0000 (17:17 +0000)
committer Andreas Dilger <adilger@whamcloud.com>
Thu, 6 Jun 2024 08:15:57 +0000 (08:15 +0000)
In some cases, libcfs_debug_dumpstack() can fail to output a
stack trace - either because the needed symbols are not exported
or those symbols can't be resolved at runtime. This seems to
occur more often with newer kernels. The message appears only
as:

 Lustre: ldlm_cb01_002: service thread pid 57876 was inactive for
   40.494 seconds. The thread might be hung, or it might only be
   slow and will resume later. Dumping the stack trace for
   debugging purposes:
 Pid: 57876, comm: ldlm_cb01_002 6.1.70 #1 SMP PREEMPT_DYNAMIC
   Thu Jan  4 18:52:41 UTC 2024
 Call Trace TBD:

with no stack trace (seen on CentOS 8.5 with ml 6.1.70).

For reference, the runtime symbol lookup was added and updated in:

 b49ce7a ("LU-12400 libcfs: save_stack_trace_tsk if ARCH_STACKWALK")
 58ac9d3 ("LU-14099 build: Fix for unconfigured arch_stackwalk")

First, add a message when the symbol can't be resolved correctly.
This makes it much easier to understand why the stack trace is
missing.

Second, replace libcfs_debug_dumpstack(NULL) with dump_stack().
When the task_struct is NULL, libcfs uses the current
task_struct. This replicates the functionality of dump_stack().
Using dump_stack() is more reliable, more in line with kernel
style, and not likely to be un-exported in the future.

Finally, in lustre/osc/osc_object.c the explicit stack dump is
removed, since the LBUG() that follows already dumps the stack.

Only one user of libcfs_debug_dumpstack() remains: it passes a
task_struct other than current. This can be cleaned up in a
future patch.

Lustre-change: https://review.whamcloud.com/53625
Lustre-commit: ecac0c175d934fd5624c9ad8db8f45dbc33fb56c

Test-Parameters: trivial
Signed-off-by: Timothy Day <timday@amazon.com>
Change-Id: I196c1da7e39b1a694c0cb67ecfaab58ab3e4662c
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Patrick Farrell <pfarrell@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/55239
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
libcfs/libcfs/debug.c
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/mdc/mdc_dev.c
lustre/osc/osc_object.c
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_internal.h

index 42901f3..ec56f48 100644 (file)
@@ -466,7 +466,7 @@ void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
                /* not reached */
        }
 
-       libcfs_debug_dumpstack(NULL);
+       dump_stack();
        if (libcfs_panic_on_lbug)
                panic("LBUG");
        else
index 04409df..f90202a 100644 (file)
@@ -1646,7 +1646,7 @@ int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
                break;
        default:
                LDLM_ERROR(lock, "Unknown LVB type: %d", lock->l_lvb_type);
-               libcfs_debug_dumpstack(NULL);
+               dump_stack();
                RETURN(-EINVAL);
        }
 
index 5ae1b8f..de0b0d9 100644 (file)
@@ -465,7 +465,7 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t timeout)
                LDLM_ERROR(lock, "not waiting on destroyed lock (b=5653)");
                if (ktime_get_seconds() > next) {
                        next = ktime_get_seconds() + 14400;
-                       libcfs_debug_dumpstack(NULL);
+                       dump_stack();
                }
                return 0;
        }
index fb81e30..2fcf28d 100644 (file)
@@ -1037,7 +1037,7 @@ static int mdc_get_lock_handle(const struct lu_env *env, struct osc_object *osc,
                                PFID(lu_object_fid(osc2lu(osc))));
                else
                        ldlm_resource_dump(D_ERROR, res);
-               libcfs_debug_dumpstack(NULL);
+               dump_stack();
                return -ENOENT;
        } else {
                *lh = lock->l_remote_handle;
index f9147df..9c52b0a 100644 (file)
@@ -416,7 +416,6 @@ static void osc_req_attr_set(const struct lu_env *env, struct cl_object *obj,
                        else
                                ldlm_resource_dump(D_ERROR, res);
 
-                       libcfs_debug_dumpstack(NULL);
                        LBUG();
                }
 
index 1654861..1659ef7 100644 (file)
@@ -1770,7 +1770,7 @@ static void __osd_th_check_slow(void *oth, struct osd_device *dev,
        if (ktime_before(ktime_add_ns(alloced, 30 * NSEC_PER_SEC), now)) {
                CWARN("transaction handle %p was open for too long: now %lld, alloced %lld, started %lld, closed %lld\n",
                                oth, now, alloced, started, closed);
-               libcfs_debug_dumpstack(NULL);
+               dump_stack();
        }
 }
 
@@ -2071,7 +2071,7 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d,
                        CWARN("%s: credits %u > trans_max %u\n", osd_name(dev),
                              oh->ot_credits, osd_transaction_size(dev));
                        osd_trans_dump_creds(env, th);
-                       libcfs_debug_dumpstack(NULL);
+                       dump_stack();
                        last_credits = oh->ot_credits;
                        last_printed = jiffies;
                }
index b4d2078..e2fe001 100644 (file)
@@ -1321,7 +1321,7 @@ static inline void osd_trans_declare_op(const struct lu_env *env,
                } else {
                        CWARN("%s: Invalid operation index %d\n",
                              osd_name(osd_dt_dev(oh->ot_super.th_dev)), op);
-                       libcfs_debug_dumpstack(NULL);
+                       dump_stack();
                }
        } else {
                oti->oti_declare_ops[op]++;
@@ -1352,7 +1352,7 @@ static inline void osd_trans_exec_op(const struct lu_env *env,
                        CWARN("%s: opcode %u: invalid value >= %u\n",
                              osd_name(osd_dt_dev(oh->ot_super.th_dev)),
                              op, OSD_OT_MAX);
-                       libcfs_debug_dumpstack(NULL);
+                       dump_stack();
                        return;
                }
        }