X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=libcfs%2Flibcfs%2Fwatchdog.c;h=2c6d67e749c9515350c9f58ee5dc6ac958cfbd47;hb=d38d331fa6525ffc02665f48fa52f94626360631;hp=855bf7b10e085fabdee5f2b7c4dcf3bed4954240;hpb=bb3475b47a6bd908f5e0483bb1434937604f6418;p=fs%2Flustre-release.git diff --git a/libcfs/libcfs/watchdog.c b/libcfs/libcfs/watchdog.c index 855bf7b..2c6d67e 100644 --- a/libcfs/libcfs/watchdog.c +++ b/libcfs/libcfs/watchdog.c @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * +/* * GPL HEADER START * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -26,8 +24,10 @@ * GPL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,16 +44,16 @@ #include "tracefile.h" struct lc_watchdog { - cfs_timer_t lcw_timer; /* kernel timer */ - cfs_list_t lcw_list; - cfs_time_t lcw_last_touched; - cfs_task_t *lcw_task; - cfs_atomic_t lcw_refcount; + spinlock_t lcw_lock; /* check or change lcw_list */ + int lcw_refcount; /* must hold lcw_pending_timers_lock */ + cfs_timer_t lcw_timer; /* kernel timer */ + cfs_list_t lcw_list; /* chain on pending list */ + cfs_time_t lcw_last_touched; /* last touched stamp */ + cfs_task_t *lcw_task; /* owner task */ + void (*lcw_callback)(pid_t, void *); + void *lcw_data; - void (*lcw_callback)(pid_t, void *); - void *lcw_data; - - pid_t lcw_pid; + pid_t lcw_pid; enum { LC_WATCHDOG_DISABLED, @@ -68,8 +68,8 @@ struct lc_watchdog { * and lcw_stop_completion when it exits. * Wake lcw_event_waitq to signal timer callback dispatches. 
*/ -static cfs_completion_t lcw_start_completion; -static cfs_completion_t lcw_stop_completion; +static struct completion lcw_start_completion; +static struct completion lcw_stop_completion; static cfs_waitq_t lcw_event_waitq; /* @@ -83,19 +83,18 @@ static unsigned long lcw_flags = 0; /* * Number of outstanding watchdogs. * When it hits 1, we start the dispatcher. - * When it hits 0, we stop the distpatcher. + * When it hits 0, we stop the dispatcher. */ static __u32 lcw_refcount = 0; -static CFS_DECLARE_MUTEX(lcw_refcount_sem); +static DEFINE_MUTEX(lcw_refcount_mutex); /* * List of timers that have fired that need their callbacks run by the * dispatcher. */ /* BH lock! */ -static cfs_spinlock_t lcw_pending_timers_lock = CFS_SPIN_LOCK_UNLOCKED; -static cfs_list_t lcw_pending_timers = \ - CFS_LIST_HEAD_INIT(lcw_pending_timers); +static DEFINE_SPINLOCK(lcw_pending_timers_lock); +static cfs_list_t lcw_pending_timers = CFS_LIST_HEAD_INIT(lcw_pending_timers); /* Last time a watchdog expired */ static cfs_time_t lcw_last_watchdog_time; @@ -105,15 +104,8 @@ static void lcw_dump(struct lc_watchdog *lcw) { ENTRY; -#if defined(HAVE_TASKLIST_LOCK) - cfs_read_lock(&tasklist_lock); -#elif defined(HAVE_TASK_RCU) rcu_read_lock(); -#else - CERROR("unable to dump stack because of missing export\n"); - RETURN_EXIT; -#endif - if (lcw->lcw_task == NULL) { + if (lcw->lcw_task == NULL) { LCONSOLE_WARN("Process " LPPID " was not found in the task " "list; watchdog callback may be incomplete\n", (int)lcw->lcw_pid); @@ -121,11 +113,7 @@ lcw_dump(struct lc_watchdog *lcw) libcfs_debug_dumpstack(lcw->lcw_task); } -#if defined(HAVE_TASKLIST_LOCK) - cfs_read_unlock(&tasklist_lock); -#elif defined(HAVE_TASK_RCU) rcu_read_unlock(); -#endif EXIT; } @@ -141,38 +129,30 @@ static void lcw_cb(ulong_ptr_t data) lcw->lcw_state = LC_WATCHDOG_EXPIRED; - cfs_spin_lock_bh(&lcw_pending_timers_lock); - cfs_list_add(&lcw->lcw_list, &lcw_pending_timers); - cfs_waitq_signal(&lcw_event_waitq); - 
cfs_spin_unlock_bh(&lcw_pending_timers_lock); + spin_lock_bh(&lcw->lcw_lock); + LASSERT(cfs_list_empty(&lcw->lcw_list)); - EXIT; -} + spin_lock_bh(&lcw_pending_timers_lock); + lcw->lcw_refcount++; /* +1 for pending list */ + cfs_list_add(&lcw->lcw_list, &lcw_pending_timers); + cfs_waitq_signal(&lcw_event_waitq); -static inline void lcw_get(struct lc_watchdog *lcw) -{ - cfs_atomic_inc(&lcw->lcw_refcount); -} - -static inline void lcw_put(struct lc_watchdog *lcw) -{ - if (cfs_atomic_dec_and_test(&lcw->lcw_refcount)) { - LASSERT(cfs_list_empty(&lcw->lcw_list)); - LIBCFS_FREE(lcw, sizeof(*lcw)); - } + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); + EXIT; } static int is_watchdog_fired(void) { - int rc; + int rc; - if (cfs_test_bit(LCW_FLAG_STOP, &lcw_flags)) - return 1; + if (test_bit(LCW_FLAG_STOP, &lcw_flags)) + return 1; - cfs_spin_lock_bh(&lcw_pending_timers_lock); - rc = !cfs_list_empty(&lcw_pending_timers); - cfs_spin_unlock_bh(&lcw_pending_timers_lock); - return rc; + spin_lock_bh(&lcw_pending_timers_lock); + rc = !cfs_list_empty(&lcw_pending_timers); + spin_unlock_bh(&lcw_pending_timers_lock); + return rc; } static void lcw_dump_stack(struct lc_watchdog *lcw) @@ -225,112 +205,135 @@ static void lcw_dump_stack(struct lc_watchdog *lcw) static int lcw_dispatch_main(void *data) { int rc = 0; - unsigned long flags; - struct lc_watchdog *lcw, *lcwcb; + struct lc_watchdog *lcw; + CFS_LIST_HEAD (zombies); ENTRY; - cfs_daemonize("lc_watchdogd"); - - SIGNAL_MASK_LOCK(current, flags); - sigfillset(&current->blocked); - RECALC_SIGPENDING; - SIGNAL_MASK_UNLOCK(current, flags); - - cfs_complete(&lcw_start_completion); + complete(&lcw_start_completion); while (1) { - cfs_wait_event_interruptible(lcw_event_waitq, - is_watchdog_fired(), rc); - CDEBUG(D_INFO, "Watchdog got woken up...\n"); - if (cfs_test_bit(LCW_FLAG_STOP, &lcw_flags)) { - CDEBUG(D_INFO, "LCW_FLAG_STOP was set, shutting down...\n"); - - cfs_spin_lock_bh(&lcw_pending_timers_lock); - rc = 
!cfs_list_empty(&lcw_pending_timers); - cfs_spin_unlock_bh(&lcw_pending_timers_lock); - if (rc) { - CERROR("pending timers list was not empty at " - "time of watchdog dispatch shutdown\n"); - } - break; - } + int dumplog = 1; - lcwcb = NULL; - cfs_spin_lock_bh(&lcw_pending_timers_lock); + rc = wait_event_interruptible(lcw_event_waitq, + is_watchdog_fired()); + CDEBUG(D_INFO, "Watchdog got woken up...\n"); + if (test_bit(LCW_FLAG_STOP, &lcw_flags)) { + CDEBUG(D_INFO, "LCW_FLAG_STOP set, shutting down...\n"); + + spin_lock_bh(&lcw_pending_timers_lock); + rc = !cfs_list_empty(&lcw_pending_timers); + spin_unlock_bh(&lcw_pending_timers_lock); + if (rc) { + CERROR("pending timers list was not empty at " + "time of watchdog dispatch shutdown\n"); + } + break; + } + + spin_lock_bh(&lcw_pending_timers_lock); while (!cfs_list_empty(&lcw_pending_timers)) { + int is_dumplog; lcw = cfs_list_entry(lcw_pending_timers.next, - struct lc_watchdog, - lcw_list); - lcw_get(lcw); + struct lc_watchdog, lcw_list); + /* +1 ref for callback to make sure lwc wouldn't be + * deleted after releasing lcw_pending_timers_lock */ + lcw->lcw_refcount++; + spin_unlock_bh(&lcw_pending_timers_lock); + + /* lock ordering */ + spin_lock_bh(&lcw->lcw_lock); + spin_lock_bh(&lcw_pending_timers_lock); + + if (cfs_list_empty(&lcw->lcw_list)) { + /* already removed from pending list */ + lcw->lcw_refcount--; /* -1 ref for callback */ + if (lcw->lcw_refcount == 0) + cfs_list_add(&lcw->lcw_list, &zombies); + spin_unlock_bh(&lcw->lcw_lock); + /* still hold lcw_pending_timers_lock */ + continue; + } + cfs_list_del_init(&lcw->lcw_list); - cfs_spin_unlock_bh(&lcw_pending_timers_lock); + lcw->lcw_refcount--; /* -1 ref for pending list */ + + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); CDEBUG(D_INFO, "found lcw for pid " LPPID "\n", lcw->lcw_pid); lcw_dump_stack(lcw); - if (lcwcb == NULL && - lcw->lcw_state != LC_WATCHDOG_DISABLED) - lcwcb = lcw; - else - lcw_put(lcw); - 
cfs_spin_lock_bh(&lcw_pending_timers_lock); - } - cfs_spin_unlock_bh(&lcw_pending_timers_lock); + is_dumplog = lcw->lcw_callback == lc_watchdog_dumplog; + if (lcw->lcw_state != LC_WATCHDOG_DISABLED && + (dumplog || !is_dumplog)) { + lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data); + if (dumplog && is_dumplog) + dumplog = 0; + } - /* only do callback once for this batch of lcws */ - if (lcwcb != NULL) { - lcwcb->lcw_callback(lcwcb->lcw_pid, lcwcb->lcw_data); - lcw_put(lcwcb); + spin_lock_bh(&lcw_pending_timers_lock); + lcw->lcw_refcount--; /* -1 ref for callback */ + if (lcw->lcw_refcount == 0) + cfs_list_add(&lcw->lcw_list, &zombies); + } + spin_unlock_bh(&lcw_pending_timers_lock); + + while (!cfs_list_empty(&zombies)) { + lcw = cfs_list_entry(lcw_pending_timers.next, + struct lc_watchdog, lcw_list); + cfs_list_del(&lcw->lcw_list); + LIBCFS_FREE(lcw, sizeof(*lcw)); } } - cfs_complete(&lcw_stop_completion); + complete(&lcw_stop_completion); - RETURN(rc); + RETURN(rc); } static void lcw_dispatch_start(void) { - int rc; + cfs_task_t *task; - ENTRY; - LASSERT(lcw_refcount == 1); + ENTRY; + LASSERT(lcw_refcount == 1); - cfs_init_completion(&lcw_stop_completion); - cfs_init_completion(&lcw_start_completion); + init_completion(&lcw_stop_completion); + init_completion(&lcw_start_completion); cfs_waitq_init(&lcw_event_waitq); - CDEBUG(D_INFO, "starting dispatch thread\n"); - rc = cfs_kernel_thread(lcw_dispatch_main, NULL, 0); - if (rc < 0) { - CERROR("error spawning watchdog dispatch thread: %d\n", rc); - EXIT; - return; - } - cfs_wait_for_completion(&lcw_start_completion); - CDEBUG(D_INFO, "watchdog dispatcher initialization complete.\n"); - - EXIT; + CDEBUG(D_INFO, "starting dispatch thread\n"); + task = kthread_run(lcw_dispatch_main, NULL, "lc_watchdogd"); + if (IS_ERR(task)) { + CERROR("error spawning watchdog dispatch thread: %ld\n", + PTR_ERR(task)); + EXIT; + return; + } + wait_for_completion(&lcw_start_completion); + CDEBUG(D_INFO, "watchdog dispatcher initialization 
complete.\n"); + + EXIT; } static void lcw_dispatch_stop(void) { - ENTRY; - LASSERT(lcw_refcount == 0); + ENTRY; + LASSERT(lcw_refcount == 0); - CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n"); + CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n"); - cfs_set_bit(LCW_FLAG_STOP, &lcw_flags); - cfs_waitq_signal(&lcw_event_waitq); + set_bit(LCW_FLAG_STOP, &lcw_flags); + cfs_waitq_signal(&lcw_event_waitq); - cfs_wait_for_completion(&lcw_stop_completion); + wait_for_completion(&lcw_stop_completion); - CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n"); + CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n"); - EXIT; + EXIT; } struct lc_watchdog *lc_watchdog_add(int timeout, @@ -346,6 +349,8 @@ struct lc_watchdog *lc_watchdog_add(int timeout, RETURN(ERR_PTR(-ENOMEM)); } + spin_lock_init(&lcw->lcw_lock); + lcw->lcw_refcount = 1; /* refcount for owner */ lcw->lcw_task = cfs_current(); lcw->lcw_pid = cfs_curproc_pid(); lcw->lcw_callback = (callback != NULL) ? callback : lc_watchdog_dumplog; @@ -354,12 +359,11 @@ struct lc_watchdog *lc_watchdog_add(int timeout, CFS_INIT_LIST_HEAD(&lcw->lcw_list); cfs_timer_init(&lcw->lcw_timer, lcw_cb, lcw); - cfs_atomic_set(&lcw->lcw_refcount, 1); - cfs_down(&lcw_refcount_sem); - if (++lcw_refcount == 1) - lcw_dispatch_start(); - cfs_up(&lcw_refcount_sem); + mutex_lock(&lcw_refcount_mutex); + if (++lcw_refcount == 1) + lcw_dispatch_start(); + mutex_unlock(&lcw_refcount_mutex); /* Keep this working in case we enable them by default */ if (lcw->lcw_state == LC_WATCHDOG_ENABLED) { @@ -394,15 +398,25 @@ static void lcw_update_time(struct lc_watchdog *lcw, const char *message) lcw->lcw_last_touched = newtime; } +static void lc_watchdog_del_pending(struct lc_watchdog *lcw) +{ + spin_lock_bh(&lcw->lcw_lock); + if (unlikely(!cfs_list_empty(&lcw->lcw_list))) { + spin_lock_bh(&lcw_pending_timers_lock); + cfs_list_del_init(&lcw->lcw_list); + lcw->lcw_refcount--; /* -1 ref for pending list */ + spin_unlock_bh(&lcw_pending_timers_lock); 
+ } + + spin_unlock_bh(&lcw->lcw_lock); +} + void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout) { ENTRY; LASSERT(lcw != NULL); - LASSERT(cfs_atomic_read(&lcw->lcw_refcount) > 0); - cfs_spin_lock_bh(&lcw_pending_timers_lock); - cfs_list_del_init(&lcw->lcw_list); - cfs_spin_unlock_bh(&lcw_pending_timers_lock); + lc_watchdog_del_pending(lcw); lcw_update_time(lcw, "resumed"); lcw->lcw_state = LC_WATCHDOG_ENABLED; @@ -418,12 +432,8 @@ void lc_watchdog_disable(struct lc_watchdog *lcw) { ENTRY; LASSERT(lcw != NULL); - LASSERT(cfs_atomic_read(&lcw->lcw_refcount) > 0); - cfs_spin_lock_bh(&lcw_pending_timers_lock); - if (!cfs_list_empty(&lcw->lcw_list)) - cfs_list_del_init(&lcw->lcw_list); - cfs_spin_unlock_bh(&lcw_pending_timers_lock); + lc_watchdog_del_pending(lcw); lcw_update_time(lcw, "completed"); lcw->lcw_state = LC_WATCHDOG_DISABLED; @@ -434,26 +444,36 @@ EXPORT_SYMBOL(lc_watchdog_disable); void lc_watchdog_delete(struct lc_watchdog *lcw) { + int dead; + ENTRY; LASSERT(lcw != NULL); - LASSERT(cfs_atomic_read(&lcw->lcw_refcount) > 0); cfs_timer_disarm(&lcw->lcw_timer); lcw_update_time(lcw, "stopped"); - cfs_spin_lock_bh(&lcw_pending_timers_lock); - if (!cfs_list_empty(&lcw->lcw_list)) - cfs_list_del_init(&lcw->lcw_list); - cfs_spin_unlock_bh(&lcw_pending_timers_lock); - lcw_put(lcw); + spin_lock_bh(&lcw->lcw_lock); + spin_lock_bh(&lcw_pending_timers_lock); + if (unlikely(!cfs_list_empty(&lcw->lcw_list))) { + cfs_list_del_init(&lcw->lcw_list); + lcw->lcw_refcount--; /* -1 ref for pending list */ + } - cfs_down(&lcw_refcount_sem); - if (--lcw_refcount == 0) - lcw_dispatch_stop(); - cfs_up(&lcw_refcount_sem); + lcw->lcw_refcount--; /* -1 ref for owner */ + dead = lcw->lcw_refcount == 0; + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); - EXIT; + if (dead) + LIBCFS_FREE(lcw, sizeof(*lcw)); + + mutex_lock(&lcw_refcount_mutex); + if (--lcw_refcount == 0) + lcw_dispatch_stop(); + mutex_unlock(&lcw_refcount_mutex); + + EXIT; } 
EXPORT_SYMBOL(lc_watchdog_delete);