From be031767b14b53b4e0a577c3a45d0c471c5f89a9 Mon Sep 17 00:00:00 2001 From: bobijam Date: Tue, 19 Aug 2008 01:59:07 +0000 Subject: [PATCH] Branch HEAD b=16566 o=Jonathan Li(jli@cray.com) i=shadow, bobijam Description: Upcall on Lustre log has been dumped Details : Allow for a user mode script to be called once a Lustre log has been dumped. It passes the filename of the dumped log to the script, the location of the script can be specified via /proc/sys/lnet/debug_log_upcall. --- libcfs/libcfs/debug.c | 4 ++++ libcfs/libcfs/linux/linux-debug.c | 36 ++++++++++++++++++++++++++++++++++++ libcfs/libcfs/linux/linux-proc.c | 35 ++++++++++++++++++++++++----------- libcfs/libcfs/tracefile.h | 8 +++++--- lustre/ChangeLog | 14 +++++++++++--- lustre/include/obd_support.h | 2 ++ lustre/ptlrpc/service.c | 9 ++++++--- 7 files changed, 88 insertions(+), 20 deletions(-) diff --git a/libcfs/libcfs/debug.c b/libcfs/libcfs/debug.c index 3ae687d..41848c2 100644 --- a/libcfs/libcfs/debug.c +++ b/libcfs/libcfs/debug.c @@ -418,6 +418,9 @@ libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) return 0; } +/** + * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages() + */ void libcfs_debug_dumplog_internal(void *arg) { CFS_DECL_JOURNAL_DATA; @@ -431,6 +434,7 @@ void libcfs_debug_dumplog_internal(void *arg) printk(KERN_ALERT "LustreError: dumping log to %s\n", debug_file_name); tracefile_dump_all_pages(debug_file_name); + libcfs_run_debug_log_upcall(debug_file_name); } CFS_POP_JOURNAL; } diff --git a/libcfs/libcfs/linux/linux-debug.c b/libcfs/libcfs/linux/linux-debug.c index c25e659..e405ceb 100644 --- a/libcfs/libcfs/linux/linux-debug.c +++ b/libcfs/libcfs/linux/linux-debug.c @@ -78,6 +78,42 @@ #endif char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall"; +char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall"; + +/** + * Upcall function once a Lustre log has been dumped. 
+ * + * \param file path of the dumped log + */ +void libcfs_run_debug_log_upcall(char *file) +{ + char *argv[3]; + int rc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + ENTRY; + + argv[0] = lnet_debug_log_upcall; + + LASSERTF(file != NULL, "called on a null filename\n"); + argv[1] = file; //only need to pass the path of the file + + argv[2] = NULL; + + rc = USERMODEHELPER(argv[0], argv, envp); + if (rc < 0 && rc != -ENOENT) { + CERROR("Error %d invoking LNET debug log upcall %s %s; " + "check /proc/sys/lnet/debug_log_upcall\n", + rc, argv[0], argv[1]); + } else { + CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n", + argv[0], argv[1]); + } + + EXIT; +} void libcfs_run_upcall(char **argv) { diff --git a/libcfs/libcfs/linux/linux-proc.c b/libcfs/libcfs/linux/linux-proc.c index e4580ff..a6b10f0 100644 --- a/libcfs/libcfs/linux/linux-proc.c +++ b/libcfs/libcfs/linux/linux-proc.c @@ -78,6 +78,10 @@ static cfs_sysctl_table_header_t *lnet_table_header = NULL; extern char lnet_upcall[1024]; +/** + * The path of debug log dump upcall script. 
+ */ +extern char lnet_debug_log_upcall[1024]; #define PSDEV_LNET (0x100) enum { @@ -97,11 +101,12 @@ enum { PSDEV_LNET_DUMP_KERNEL, /* snapshot kernel debug buffer to file */ PSDEV_LNET_DAEMON_FILE, /* spool kernel debug buffer to file */ PSDEV_LNET_DEBUG_MB, /* size of debug buffer */ + PSDEV_LNET_DEBUG_LOG_UPCALL, /* debug log upcall script */ }; -static int -proc_call_handler(void *data, int write, - loff_t *ppos, void *buffer, size_t *lenp, +static int +proc_call_handler(void *data, int write, + loff_t *ppos, void *buffer, size_t *lenp, int (*handler)(void *data, int write, loff_t pos, void *buffer, int len)) { @@ -130,7 +135,7 @@ LL_PROC_PROTO(name) \ __##name); \ } -static int __proc_dobitmasks(void *data, int write, +static int __proc_dobitmasks(void *data, int write, loff_t pos, void *buffer, int nob) { const int tmpstrlen = 512; @@ -176,7 +181,7 @@ static int __proc_dump_kernel(void *data, int write, { if (!write) return 0; - + return trace_dump_debug_buffer_usrstr(buffer, nob); } @@ -187,14 +192,14 @@ static int __proc_daemon_file(void *data, int write, { if (!write) { int len = strlen(tracefile); - + if (pos >= len) return 0; - - return trace_copyout_string(buffer, nob, + + return trace_copyout_string(buffer, nob, tracefile + pos, "\n"); } - + return trace_daemon_command_usrstr(buffer, nob); } @@ -210,10 +215,10 @@ static int __proc_debug_mb(void *data, int write, if (pos >= len) return 0; - + return trace_copyout_string(buffer, nob, tmpstr + pos, "\n"); } - + return trace_set_debug_mb_usrstr(buffer, nob); } @@ -385,6 +390,14 @@ static cfs_sysctl_table_t lnet_table[] = { .proc_handler = &proc_dostring, }, { + .ctl_name = PSDEV_LNET_DEBUG_LOG_UPCALL, + .procname = "debug_log_upcall", + .data = lnet_debug_log_upcall, + .maxlen = sizeof(lnet_debug_log_upcall), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + { .ctl_name = PSDEV_LNET_MEMUSED, .procname = "memused", .data = (int *)&libcfs_kmemory.counter, diff --git a/libcfs/libcfs/tracefile.h 
b/libcfs/libcfs/tracefile.h index 0493063..dd25327 100644 --- a/libcfs/libcfs/tracefile.h +++ b/libcfs/libcfs/tracefile.h @@ -45,6 +45,8 @@ extern char tracefile[TRACEFILE_NAME_SIZE]; extern long long tracefile_size; +extern void libcfs_run_debug_log_upcall(char *file); + int tracefile_init_arch(void); void tracefile_fini_arch(void); @@ -96,7 +98,7 @@ extern int trace_max_debug_mb(void); #define TRACEFILE_SIZE (500 << 20) -/* Size of a buffer for sprinting console messages if we can't get a page +/* Size of a buffer for sprinting console messages if we can't get a page * from system */ #define TRACE_CONSOLE_BUFFER_SIZE 1024 @@ -125,7 +127,7 @@ union trace_data_union { /* * Maximal number of pages allowed on ->tcd_pages and - * ->tcd_daemon_pages each. + * ->tcd_daemon_pages each. * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current * implementation. */ @@ -233,7 +235,7 @@ struct trace_page { */ unsigned short cpu; /* - * type(context) of this page + * type(context) of this page */ unsigned short type; }; diff --git a/lustre/ChangeLog b/lustre/ChangeLog index f286dd9..a3ce983 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -12,12 +12,20 @@ tbd Sun Microsystems, Inc. * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a removed cwd "./" (refer to Bugzilla 14399). +Severity : enhancement +Bugzilla : 16566 +Description: Upcall on Lustre log has been dumped +Details : Allow for a user mode script to be called once a Lustre log has + been dumped. It passes the filename of the dumped log to the + script, the location of the script can be specified via + /proc/sys/lnet/debug_log_upcall. + Severity : minor Bugzilla : 16583 Frequency : rare -Description: avoid messages about idr_remove called for id which is not allocated. -Details : Move assigment s_dev for clustered nfs to end of initialization, for avoid - problem with error handling. +Description: avoid messages about idr_remove called for id which is not allocated. 
+Details : Move assignment s_dev for clustered nfs to end of initialization, to + avoid problem with error handling. Severity : minor Bugzilla : 16109 diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index cda024f..4a9a82f 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -280,6 +280,8 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a #define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c +#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e + #define OBD_FAIL_OBD_PING_NET 0x600 #define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 #define OBD_FAIL_OBD_LOGD_NET 0x602 diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 2f6c814..c362aad 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -644,7 +644,7 @@ static void ptlrpc_at_set_timer(struct ptlrpc_service *svc) /* Set timer for closest deadline */ rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request, rq_timed_list); - next = (__s32)(rq->rq_deadline - cfs_time_current_sec() - + next = (__s32)(rq->rq_deadline - cfs_time_current_sec() - at_early_margin); if (next <= 0) ptlrpc_at_timer((unsigned long)svc); @@ -1071,6 +1071,9 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, spin_unlock(&svc->srv_lock); + if(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) + libcfs_debug_dumplog(); + do_gettimeofday(&work_start); timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time,NULL); if (likely(svc->srv_stats != NULL)) { @@ -1619,7 +1622,7 @@ int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc) if (rc == -EMFILE) break; if (rc) { - CERROR("cannot start %s thread #%d: rc %d\n", + CERROR("cannot start %s thread #%d: rc %d\n", svc->srv_thread_name, i, rc); ptlrpc_stop_all_threads(svc); } @@ -1667,7 +1670,7 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc) d.thread = thread; CDEBUG(D_RPCTRACE, "starting thread '%s'\n", name); - + /* CLONE_VM and 
CLONE_FILES just avoid a needless copy, because we * just drop the VM and FILES in ptlrpc_daemonize() right away. */ -- 1.8.3.1