LU-9679 osc: convert cl_cache_waiters to a wait_queue. 75/38575/4
author Oleg Drokin <green@whamcloud.com>
Tue, 12 May 2020 00:05:32 +0000 (20:05 -0400)
committer Oleg Drokin <green@whamcloud.com>
Sat, 23 May 2020 19:56:59 +0000 (19:56 +0000)
cli->cl_cache_waiters is a list of tasks that need
to be woken when grant-space becomes available.  This
means it is acting much like a wait queue.
So let's change it to really be a wait queue.

The current implementation adds new waiters to the end of the list,
and calls osc_enter_cache_try() on each in order.  We can provide the
same behaviour by using an exclusive wait, and having each waiter wake
the next task when it succeeds.

If a waiter notices that success has become impossible, it wakes all
other waiters.

If a waiter times out, it doesn't wake the others - it just leaves
them to time out themselves.
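
For illustration only, here is a minimal sketch of that pattern using
wait_event_idle_exclusive_timeout() (provided by recent mainline
kernels, and by the linux-wait.h fallback in this patch).  Every symbol
below (grant_waitq, avail_grant, try_take_grant(), ...) is a
hypothetical stand-in, not an actual client_obd field:

/*
 * Illustrative only: none of these symbols exist in Lustre; they are
 * hypothetical stand-ins for cl_cache_waiters, cl_avail_grant and
 * osc_enter_cache_try().
 */
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(grant_waitq);
static DEFINE_SPINLOCK(grant_lock);
static long avail_grant;		/* grant bytes still available */
static bool ost_has_space = true;	/* false once success is impossible */

static bool try_take_grant(long bytes)
{
	bool ok = false;

	spin_lock(&grant_lock);
	if (avail_grant >= bytes) {
		avail_grant -= bytes;
		ok = true;
	}
	spin_unlock(&grant_lock);
	return ok;
}

/* Producer side: returned grant wakes exactly one exclusive waiter. */
static void put_back_grant(long bytes)
{
	spin_lock(&grant_lock);
	avail_grant += bytes;
	spin_unlock(&grant_lock);
	wake_up(&grant_waitq);
}

static int reserve_grant(long bytes)
{
	bool entered = false;
	long remain;

	/* Exclusive wait: each wake_up() rouses only the first queued task. */
	remain = wait_event_idle_exclusive_timeout(grant_waitq,
			(entered = try_take_grant(bytes)) || !ost_has_space,
			10 * HZ);

	if (entered) {
		/* Success: wake the next waiter so it can try in turn. */
		wake_up(&grant_waitq);
		return 0;
	}
	if (remain == 0)
		/* Timed out: leave the others to time out by themselves. */
		return -ETIMEDOUT;

	/* Success has become impossible: let every other waiter see that. */
	wake_up_all(&grant_waitq);
	return -EDQUOT;
}

The patch's osc_enter_cache() follows this shape, with
osc_enter_cache_try() as the condition and the cl_dirty_pages /
cl_w_in_flight check as the "success is impossible" test.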

Note that the old code handled -EINTR from the wait function.  That is
not a possible return value when wait_event_idle* is used, so that
case is discarded.

As we need wait_event_idle_exclusive_timeout_cmd(), we should fix the
bug in that macro - the "might_sleep()" is wrong, as a spinlock might
be held at that point.
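
This matters because the macro is entered with a spinlock held: only
the cmd1/cmd2 fragments drop and retake the lock around the actual
sleep, so a might_sleep() at the top of the macro would fire a false
warning.  A minimal sketch of that calling convention follows (the
lock, wait queue and cond callback are hypothetical placeholders, not
Lustre symbols; osc_enter_cache() in this patch uses the same shape
around cl_loi_list_lock):

/* Minimal sketch only: state_lock, state_waitq and the cond callback are
 * hypothetical.  wait_event_idle_exclusive_timeout_cmd() is the macro this
 * patch adds to libcfs' linux-wait.h.
 */
#include <libcfs/linux/linux-wait.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

static DEFINE_SPINLOCK(state_lock);
static DECLARE_WAIT_QUEUE_HEAD(state_waitq);

/* Returns remaining jiffies (>= 1) if @cond became true, 0 on timeout.
 * @cond is evaluated with state_lock held, so it must not sleep.
 */
static long wait_for_state(bool (*cond)(void), long timeout)
{
	long remain;

	spin_lock(&state_lock);
	/* The macro only sleeps between cmd1 and cmd2, i.e. after the lock
	 * has been dropped, so it must not call might_sleep() on entry.
	 */
	remain = wait_event_idle_exclusive_timeout_cmd(
			state_waitq, cond(), timeout,
			spin_unlock(&state_lock),  /* cmd1: before sleeping */
			spin_lock(&state_lock));   /* cmd2: after waking */
	spin_unlock(&state_lock);

	return remain;
}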

Linux-Commit: 31f45f56ecdf ("lustre: osc_cache: convert
cl_cache_waiters to a wait_queue.")

Lustre-change: https://review.whamcloud.com/37605
Lustre-commit: b2ede01d1ed77ddc512c013220f6ea8b509e9541

Change-Id: Ib7622ea2daea8f6e59bef95d3b6c5a80d209b81e
Signed-off-by: Mr NeilBrown <neilb@suse.com>
Reviewed-on: https://review.whamcloud.com/38575
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
libcfs/include/libcfs/linux/linux-wait.h
lustre/include/lustre_osc.h
lustre/include/obd.h
lustre/ldlm/ldlm_lib.c
lustre/osc/osc_cache.c
lustre/osc/osc_internal.h
lustre/osc/osc_page.c

libcfs/include/libcfs/linux/linux-wait.h
index a497dce..fd154ba 100644 (file)
@@ -2,6 +2,8 @@
 #ifndef __LIBCFS_LINUX_WAIT_BIT_H
 #define __LIBCFS_LINUX_WAIT_BIT_H
 
+/* Make sure we can see if we have TASK_NOLOAD */
+#include <linux/sched.h>
 /*
  * Linux wait-bit related types and methods:
  */
@@ -31,6 +33,18 @@ extern long prepare_to_wait_event(wait_queue_head_t *wq_head,
                                  wait_queue_entry_t *wq_entry, int state);
 #endif
 
+/* ___wait_cond_timeout changed number of args in v3.12-rc1-78-g35a2af94c7ce
+ * so let's define our own ___wait_cond_timeout1
+ */
+
+#define ___wait_cond_timeout1(condition)                               \
+({                                                                     \
+       bool __cond = (condition);                                      \
+       if (__cond && !__ret)                                           \
+               __ret = 1;                                              \
+       __cond || !__ret;                                               \
+})
+
 #ifndef HAVE_CLEAR_AND_WAKE_UP_BIT
 /**
  * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit
@@ -110,7 +124,7 @@ do {                                                                        \
 })
 
 #define __wait_var_event_timeout(var, condition, timeout)              \
-       ___wait_var_event(var, ___wait_cond_timeout(condition),         \
+       ___wait_var_event(var, ___wait_cond_timeout1(condition),        \
                          TASK_UNINTERRUPTIBLE, 0, timeout,             \
                          __ret = schedule_timeout(__ret))
 
@@ -118,10 +132,437 @@ do {                                                                     \
 ({                                                                     \
        long __ret = timeout;                                           \
        might_sleep();                                                  \
-       if (!___wait_cond_timeout(condition))                           \
+       if (!___wait_cond_timeout1(condition))                          \
                __ret = __wait_var_event_timeout(var, condition, timeout); \
        __ret;                                                          \
 })
 #endif /* ! HAVE_WAIT_VAR_EVENT */
 
+/*
+ * prepare_to_wait_event() does not support an exclusive
+ * lifo wait.
+ * However it will not relink the wait_queue_entry if
+ * it is already linked.  So we link to the head of the
+ * queue here, and it will stay there.
+ */
+static inline void prepare_to_wait_exclusive_head(
+       wait_queue_head_t *waitq, wait_queue_entry_t *link)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&(waitq->lock), flags);
+#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST
+       if (list_empty(&link->entry))
+#else
+       if (list_empty(&link->task_list))
+#endif
+               __add_wait_queue_exclusive(waitq, link);
+       spin_unlock_irqrestore(&((waitq)->lock), flags);
+}
+
+#ifndef ___wait_event
+/*
+ * The below macro ___wait_event() has an explicit shadow of the __ret
+ * variable when used from the wait_event_*() macros.
+ *
+ * This is so that both can use the ___wait_cond_timeout1() construct
+ * to wrap the condition.
+ *
+ * The type inconsistency of the wait_event_*() __ret variable is also
+ * on purpose; we use long where we can return timeout values and int
+ * otherwise.
+ */
+
+#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)  \
+({                                                                     \
+       __label__ __out;                                                \
+       wait_queue_entry_t __wq_entry;                                  \
+       long __ret = ret;       /* explicit shadow */                   \
+                                                                       \
+       init_wait(&__wq_entry);                                         \
+       if (exclusive)                                                  \
+               __wq_entry.flags = WQ_FLAG_EXCLUSIVE;                   \
+       for (;;) {                                                      \
+               long __int = prepare_to_wait_event(&wq_head,            \
+                                                 &__wq_entry, state);  \
+                                                                       \
+               if (condition)                                          \
+                       break;                                          \
+                                                                       \
+               if (___wait_is_interruptible(state) && __int) {         \
+                       __ret = __int;                                  \
+                       goto __out;                                     \
+               }                                                       \
+                                                                       \
+               cmd;                                                    \
+       }                                                               \
+       finish_wait(&wq_head, &__wq_entry);                             \
+__out: __ret;                                                          \
+})
+#endif
+
+#ifndef TASK_NOLOAD
+
+#define ___wait_event_idle(wq_head, condition, exclusive, ret, cmd)    \
+({                                                                     \
+       wait_queue_entry_t __wq_entry;                                  \
+       unsigned long flags;                                            \
+       long __ret = ret;       /* explicit shadow */                   \
+       sigset_t __blocked;                                             \
+                                                                       \
+       __blocked = cfs_block_sigsinv(0);                               \
+       init_wait(&__wq_entry);                                         \
+       if (exclusive)                                                  \
+               __wq_entry.flags = WQ_FLAG_EXCLUSIVE;                   \
+       for (;;) {                                                      \
+               prepare_to_wait_event(&wq_head,                         \
+                                  &__wq_entry,                         \
+                                  TASK_INTERRUPTIBLE);                 \
+                                                                       \
+               if (condition)                                          \
+                       break;                                          \
+               /* We have to do this here because some signals */      \
+               /* are not blockable - ie from strace(1).       */      \
+               /* In these cases we want to schedule_timeout() */      \
+               /* again, because we don't want that to return  */      \
+               /* -EINTR when the RPC actually succeeded.      */      \
+               /* the recalc_sigpending() below will deliver the */    \
+               /* signal properly.                             */      \
+               if (signal_pending(current)) {                          \
+                       spin_lock_irqsave(&current->sighand->siglock,   \
+                                         flags);                       \
+                       clear_tsk_thread_flag(current, TIF_SIGPENDING); \
+                       spin_unlock_irqrestore(&current->sighand->siglock,\
+                                              flags);                  \
+               }                                                       \
+               cmd;                                                    \
+       }                                                               \
+       finish_wait(&wq_head, &__wq_entry);                             \
+       cfs_restore_sigs(__blocked);                                    \
+       __ret;                                                          \
+})
+
+#define wait_event_idle(wq_head, condition)                            \
+do {                                                                   \
+       might_sleep();                                                  \
+       if (!(condition))                                               \
+               ___wait_event_idle(wq_head, condition, 0, 0, schedule());\
+} while (0)
+
+#define wait_event_idle_exclusive(wq_head, condition)                  \
+do {                                                                   \
+       might_sleep();                                                  \
+       if (!(condition))                                               \
+               ___wait_event_idle(wq_head, condition, 1, 0, schedule());\
+} while (0)
+
+#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)\
+       ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition),   \
+                          1, timeout,                                  \
+                          __ret = schedule_timeout(__ret))
+
+#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \
+({                                                                     \
+       long __ret = timeout;                                           \
+       might_sleep();                                                  \
+       if (!___wait_cond_timeout1(condition))                          \
+               __ret = __wait_event_idle_exclusive_timeout(            \
+                       wq_head, condition, timeout);                   \
+       __ret;                                                          \
+})
+
+#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition,    \
+                                               timeout, cmd1, cmd2)    \
+       ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition),   \
+                          1, timeout,                                  \
+                          cmd1; __ret = schedule_timeout(__ret); cmd2)
+
+#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\
+                                             cmd1, cmd2)               \
+({                                                                     \
+       long __ret = timeout;                                           \
+       if (!___wait_cond_timeout1(condition))                          \
+               __ret = __wait_event_idle_exclusive_timeout_cmd(        \
+                       wq_head, condition, timeout, cmd1, cmd2);       \
+       __ret;                                                          \
+})
+
+#define __wait_event_idle_timeout(wq_head, condition, timeout)         \
+       ___wait_event_idle(wq_head, ___wait_cond_timeout1(condition),   \
+                          0, timeout,                                  \
+                          __ret = schedule_timeout(__ret))
+
+#define wait_event_idle_timeout(wq_head, condition, timeout)           \
+({                                                                     \
+       long __ret = timeout;                                           \
+       might_sleep();                                                  \
+       if (!___wait_cond_timeout1(condition))                          \
+               __ret = __wait_event_idle_timeout(wq_head, condition,   \
+                                                 timeout);             \
+       __ret;                                                          \
+})
+
+#else /* TASK_IDLE */
+#ifndef wait_event_idle
+/**
+ * wait_event_idle - wait for a condition without contributing to system load
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_IDLE) until the
+ * @condition evaluates to true.
+ * The @condition is checked each time the waitqueue @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ */
+#define wait_event_idle(wq_head, condition)                            \
+do {                                                                   \
+       might_sleep();                                                  \
+       if (!(condition))                                               \
+               ___wait_event(wq_head, condition, TASK_IDLE, 0, 0,      \
+                             schedule());                              \
+} while (0)
+#endif
+#ifndef wait_event_idle_exclusive
+/**
+ * wait_event_idle_exclusive - wait for a condition without contributing to
+ *               system load
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_IDLE) until the
+ * @condition evaluates to true.
+ * The @condition is checked each time the waitqueue @wq_head is woken up.
+ *
+ * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag
+ * set thus if other processes wait on the same list, when this
+ * process is woken further processes are not considered.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ */
+#define wait_event_idle_exclusive(wq_head, condition)                  \
+do {                                                                   \
+       might_sleep();                                                  \
+       if (!(condition))                                               \
+               ___wait_event(wq_head, condition, TASK_IDLE, 1, 0,      \
+                             schedule());                              \
+} while (0)
+#endif
+#ifndef wait_event_idle_exclusive_timeout
+/**
+ * wait_event_idle_exclusive_timeout - sleep without load until a condition
+ *                       becomes true or a timeout elapses
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, in jiffies
+ *
+ * The process is put to sleep (TASK_IDLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq_head is woken up.
+ *
+ * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag
+ * set thus if other processes wait on the same list, when this
+ * process is woken further processes are not considered.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * or the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed.
+ */
+#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \
+({                                                                     \
+       long __ret = timeout;                                           \
+       might_sleep();                                                  \
+       if (!___wait_cond_timeout1(condition))                          \
+               __ret = __wait_event_idle_exclusive_timeout(wq_head,    \
+                                                           condition,  \
+                                                           timeout);   \
+       __ret;                                                          \
+})
+#endif
+#ifndef wait_event_idle_exclusive_timeout_cmd
+#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition,    \
+                                               timeout, cmd1, cmd2)    \
+       ___wait_event(wq_head, ___wait_cond_timeout1(condition),        \
+                     TASK_IDLE, 1, timeout,                            \
+                     cmd1; __ret = schedule_timeout(__ret); cmd2)
+
+#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\
+                                             cmd1, cmd2)               \
+({                                                                     \
+       long __ret = timeout;                                           \
+       if (!___wait_cond_timeout1(condition))                          \
+               __ret = __wait_event_idle_exclusive_timeout_cmd(        \
+                       wq_head, condition, timeout, cmd1, cmd2);       \
+       __ret;                                                          \
+})
+#endif
+
+#ifndef wait_event_idle_timeout
+
+#define __wait_event_idle_timeout(wq_head, condition, timeout)         \
+       ___wait_event(wq_head, ___wait_cond_timeout1(condition),        \
+                     TASK_IDLE, 0, timeout,                            \
+                     __ret = schedule_timeout(__ret))
+
+/**
+ * wait_event_idle_timeout - sleep without load until a condition becomes
+ *                           true or a timeout elapses
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, in jiffies
+ *
+ * The process is put to sleep (TASK_IDLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * or the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed.
+ */
+#define wait_event_idle_timeout(wq_head, condition, timeout)           \
+({                                                                     \
+       long __ret = timeout;                                           \
+       might_sleep();                                                  \
+       if (!___wait_cond_timeout1(condition))                          \
+               __ret = __wait_event_idle_timeout(wq_head, condition,   \
+                                                 timeout);             \
+       __ret;                                                          \
+})
+#endif
+#endif /* TASK_IDLE */
+
+/* ___wait_event_lifo is used for lifo exclusive 'idle' waits */
+#ifdef TASK_NOLOAD
+
+#define ___wait_event_lifo(wq_head, condition, ret, cmd)               \
+({                                                                     \
+       wait_queue_entry_t       __wq_entry;                            \
+       long __ret = ret;       /* explicit shadow */                   \
+                                                                       \
+       init_wait(&__wq_entry);                                         \
+       __wq_entry.flags =  WQ_FLAG_EXCLUSIVE;                          \
+       for (;;) {                                                      \
+               prepare_to_wait_exclusive_head(&wq_head, &__wq_entry);  \
+               prepare_to_wait_event(&wq_head, &__wq_entry, TASK_IDLE);\
+                                                                       \
+               if (condition)                                          \
+                       break;                                          \
+                                                                       \
+               cmd;                                                    \
+       }                                                               \
+       finish_wait(&wq_head, &__wq_entry);                             \
+       __ret;                                                          \
+})
+#else
+#define ___wait_event_lifo(wq_head, condition, ret, cmd)               \
+({                                                                     \
+       wait_queue_entry_t __wq_entry;                                  \
+       unsigned long flags;                                            \
+       long __ret = ret;       /* explicit shadow */                   \
+       sigset_t __blocked;                                             \
+                                                                       \
+       __blocked = cfs_block_sigsinv(0);                               \
+       init_wait(&__wq_entry);                                         \
+       __wq_entry.flags = WQ_FLAG_EXCLUSIVE;                           \
+       for (;;) {                                                      \
+               prepare_to_wait_exclusive_head(&wq_head, &__wq_entry);  \
+               prepare_to_wait_event(&wq_head, &__wq_entry,            \
+                                     TASK_INTERRUPTIBLE);              \
+                                                                       \
+               if (condition)                                          \
+                       break;                                          \
+               /* See justification in ___wait_event_idle */           \
+               if (signal_pending(current)) {                          \
+                       spin_lock_irqsave(&current->sighand->siglock,   \
+                                         flags);                       \
+                       clear_tsk_thread_flag(current, TIF_SIGPENDING); \
+                       spin_unlock_irqrestore(&current->sighand->siglock,\
+                                              flags);                  \
+               }                                                       \
+               cmd;                                                    \
+       }                                                               \
+       cfs_restore_sigs(__blocked);                                    \
+       finish_wait(&wq_head, &__wq_entry);                             \
+       __ret;                                                          \
+})
+#endif
+
+#define wait_event_idle_exclusive_lifo(wq_head, condition)             \
+do {                                                                   \
+       might_sleep();                                                  \
+       if (!(condition))                                               \
+               ___wait_event_lifo(wq_head, condition, 0, schedule());  \
+} while (0)
+
+#define __wait_event_idle_lifo_timeout(wq_head, condition, timeout)    \
+       ___wait_event_lifo(wq_head, ___wait_cond_timeout1(condition),   \
+                          timeout,                                     \
+                          __ret = schedule_timeout(__ret))
+
+#define wait_event_idle_exclusive_lifo_timeout(wq_head, condition, timeout)\
+({                                                                     \
+       long __ret = timeout;                                           \
+       might_sleep();                                                  \
+       if (!___wait_cond_timeout1(condition))                          \
+               __ret = __wait_event_idle_lifo_timeout(wq_head,         \
+                                                      condition,       \
+                                                      timeout);        \
+       __ret;                                                          \
+})
+
+/* l_wait_event_abortable() is a bit like wait_event_killable()
+ * except there is a fixed set of signals which will abort:
+ * LUSTRE_FATAL_SIGS
+ */
+#define LUSTRE_FATAL_SIGS                                       \
+       (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGTERM) | \
+        sigmask(SIGQUIT) | sigmask(SIGALRM))
+
+#define l_wait_event_abortable(wq, condition)                          \
+({                                                                     \
+       sigset_t __new_blocked, __old_blocked;                          \
+       int __ret = 0;                                                  \
+       siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS);               \
+       sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked);         \
+       __ret = wait_event_interruptible(wq, condition);                \
+       sigprocmask(SIG_SETMASK, &__old_blocked, NULL);                 \
+       __ret;                                                          \
+})
+
+#define l_wait_event_abortable_timeout(wq, condition, timeout)         \
+({                                                                     \
+       sigset_t __new_blocked, __old_blocked;                          \
+       int __ret = 0;                                                  \
+       siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS);               \
+       sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked);         \
+       __ret = wait_event_interruptible_timeout(wq, condition, timeout);\
+       sigprocmask(SIG_SETMASK, &__old_blocked, NULL);                 \
+       __ret;                                                          \
+})
+
+#define l_wait_event_abortable_exclusive(wq, condition)                        \
+({                                                                     \
+       sigset_t __new_blocked, __old_blocked;                          \
+       int __ret = 0;                                                  \
+       siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS);               \
+       sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked);         \
+       __ret = wait_event_interruptible_exclusive(wq, condition);      \
+       sigprocmask(SIG_SETMASK, &__old_blocked, NULL);                 \
+       __ret;                                                          \
+})
+
 #endif /* __LIBCFS_LINUX_WAIT_BIT_H */
lustre/include/lustre_osc.h
index 32a2c63..c53cba8 100644 (file)
@@ -100,14 +100,6 @@ static inline struct osc_async_page *brw_page2oap(struct brw_page *pga)
        return container_of(pga, struct osc_async_page, oap_brw_page);
 }
 
-struct osc_cache_waiter {
-       struct list_head        ocw_entry;
-       wait_queue_head_t       ocw_waitq;
-       struct osc_async_page   *ocw_oap;
-       int                     ocw_grant;
-       int                     ocw_rc;
-};
-
 struct osc_device {
        struct cl_device        od_cl;
        struct obd_export       *od_exp;
@@ -614,7 +606,10 @@ int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
                         pgoff_t start, pgoff_t end);
 int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
                   struct osc_object *osc, int async);
-void osc_wake_cache_waiters(struct client_obd *cli);
+static inline void osc_wake_cache_waiters(struct client_obd *cli)
+{
+       wake_up(&cli->cl_cache_waiters);
+}
 
 static inline int osc_io_unplug_async(const struct lu_env *env,
                                      struct client_obd *cli,
lustre/include/obd.h
index e86cac4..10f22c0 100644 (file)
@@ -215,7 +215,7 @@ struct client_obd {
         * grant before trying to dirty a page and unreserve the rest.
         * See osc_{reserve|unreserve}_grant for details. */
        long                    cl_reserved_grant;
-       struct list_head        cl_cache_waiters; /* waiting for cache/grant */
+       wait_queue_head_t       cl_cache_waiters; /* waiting for cache/grant */
        time64_t                cl_next_shrink_grant;   /* seconds */
        struct list_head        cl_grant_chain;
        time64_t                cl_grant_shrink_interval; /* seconds */
lustre/ldlm/ldlm_lib.c
index 43ea9de..6e0ac53 100644 (file)
@@ -366,7 +366,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
        /* cl_dirty_max_pages may be changed at connect time in
         * ptlrpc_connect_interpret(). */
        client_adjust_max_dirty(cli);
-       INIT_LIST_HEAD(&cli->cl_cache_waiters);
+       init_waitqueue_head(&cli->cl_cache_waiters);
        INIT_LIST_HEAD(&cli->cl_loi_ready_list);
        INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
        INIT_LIST_HEAD(&cli->cl_loi_write_list);
lustre/osc/osc_cache.c
index d3f200e..8026d51 100644 (file)
@@ -1561,15 +1561,26 @@ static int osc_enter_cache_try(struct client_obd *cli,
        return rc;
 }
 
-static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw)
+/* Following two inlines exist to pass code fragments
+ * to wait_event_idle_exclusive_timeout_cmd().  Passing
+ * code fragments as macro args can look confusing, so
+ * we provide inlines to encapsulate them.
+ */
+static inline void cli_unlock_and_unplug(const struct lu_env *env,
+                                        struct client_obd *cli,
+                                        struct osc_async_page *oap)
 {
-       int rc;
-       spin_lock(&cli->cl_loi_list_lock);
-       rc = list_empty(&ocw->ocw_entry);
        spin_unlock(&cli->cl_loi_list_lock);
-       return rc;
+       osc_io_unplug_async(env, cli, NULL);
+       CDEBUG(D_CACHE,
+              "%s: sleeping for cache space for %p\n",
+              cli_name(cli), oap);
 }
 
+static inline void cli_lock_after_unplug(struct client_obd *cli)
+{
+       spin_lock(&cli->cl_loi_list_lock);
+}
 /**
  * The main entry to reserve dirty page accounting. Usually the grant reserved
  * in this function will be freed in bulk in osc_free_grant() unless it fails
@@ -1582,13 +1593,12 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
 {
        struct osc_object       *osc = oap->oap_obj;
        struct lov_oinfo        *loi = osc->oo_oinfo;
-       struct osc_cache_waiter  ocw;
-       struct l_wait_info       lwi;
        int                      rc = -EDQUOT;
-       ENTRY;
+       unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout : at_max);
+       int remain;
+       bool entered = false;
 
-       lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(AT_OFF ? obd_timeout : at_max),
-                              NULL, LWI_ON_SIGNAL_NOOP, NULL);
+       ENTRY;
 
        OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes);
 
@@ -1603,76 +1613,40 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
                GOTO(out, rc = -EDQUOT);
        }
 
-       /* Hopefully normal case - cache space and write credits available */
-       if (osc_enter_cache_try(cli, oap, bytes, 0)) {
-               OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n");
-               GOTO(out, rc = 0);
-       }
-
-       /* We can get here for two reasons: too many dirty pages in cache, or
+       /*
+        * We can wait here for two reasons: too many dirty pages in cache, or
         * run out of grants. In both cases we should write dirty pages out.
         * Adding a cache waiter will trigger urgent write-out no matter what
         * RPC size will be.
-        * The exiting condition is no avail grants and no dirty pages caching,
-        * that really means there is no space on the OST. */
-       init_waitqueue_head(&ocw.ocw_waitq);
-       ocw.ocw_oap   = oap;
-       ocw.ocw_grant = bytes;
-       while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) {
-               list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters);
-               ocw.ocw_rc = 0;
-               spin_unlock(&cli->cl_loi_list_lock);
-
-               osc_io_unplug_async(env, cli, NULL);
-
-               CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n",
-                      cli_name(cli), &ocw, oap);
-
-               rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi);
-
-               spin_lock(&cli->cl_loi_list_lock);
-
-               if (rc < 0) {
-                       /* l_wait_event is interrupted by signal or timed out */
-                       list_del_init(&ocw.ocw_entry);
-                       break;
-               }
-               LASSERT(list_empty(&ocw.ocw_entry));
-               rc = ocw.ocw_rc;
-
-               if (rc != -EDQUOT)
-                       break;
-               if (osc_enter_cache_try(cli, oap, bytes, 0)) {
-                       rc = 0;
-                       break;
-               }
-       }
-
-       switch (rc) {
-       case 0:
-               OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n");
-               break;
-       case -ETIMEDOUT:
+        * The exiting condition (other than success) is no avail grants
+        * and no dirty pages caching, that really means there is no space
+        * on the OST.
+        */
+       remain = wait_event_idle_exclusive_timeout_cmd(
+               cli->cl_cache_waiters,
+               (entered = osc_enter_cache_try(cli, oap, bytes, 0)) ||
+               (cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0),
+               timeout,
+               cli_unlock_and_unplug(env, cli, oap),
+               cli_lock_after_unplug(cli));
+
+       if (entered) {
+               if (remain == timeout)
+                       OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n");
+               else
+                       OSC_DUMP_GRANT(D_CACHE, cli,
+                                      "finally got grant space\n");
+               wake_up(&cli->cl_cache_waiters);
+               rc = 0;
+       } else if (remain == 0) {
                OSC_DUMP_GRANT(D_CACHE, cli,
                               "timeout, fall back to sync i/o\n");
                osc_extent_tree_dump(D_CACHE, osc);
                /* fall back to synchronous I/O */
-               rc = -EDQUOT;
-               break;
-       case -EINTR:
-               /* Ensures restartability - LU-3581 */
-               OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n");
-               rc = -ERESTARTSYS;
-               break;
-       case -EDQUOT:
+       } else {
                OSC_DUMP_GRANT(D_CACHE, cli,
                               "no grant space, fall back to sync i/o\n");
-               break;
-       default:
-               CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived "
-                      "due to %d, fall back to sync i/o\n",
-                      cli_name(cli), &ocw, rc);
-               break;
+               wake_up_all(&cli->cl_cache_waiters);
        }
        EXIT;
 out:
@@ -1680,42 +1654,6 @@ out:
        RETURN(rc);
 }
 
-/* caller must hold loi_list_lock */
-void osc_wake_cache_waiters(struct client_obd *cli)
-{
-       struct list_head *l, *tmp;
-       struct osc_cache_waiter *ocw;
-
-       ENTRY;
-       list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
-               ocw = list_entry(l, struct osc_cache_waiter, ocw_entry);
-               list_del_init(&ocw->ocw_entry);
-
-               ocw->ocw_rc = -EDQUOT;
-               /* we can't dirty more */
-               if ((cli->cl_dirty_pages  >= cli->cl_dirty_max_pages) ||
-                   (1 + atomic_long_read(&obd_dirty_pages) >
-                    obd_max_dirty_pages)) {
-                       CDEBUG(D_CACHE, "no dirty room: dirty: %ld "
-                              "osc max %ld, sys max %ld\n",
-                              cli->cl_dirty_pages, cli->cl_dirty_max_pages,
-                              obd_max_dirty_pages);
-                       goto wakeup;
-               }
-
-               if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0))
-                       ocw->ocw_rc = 0;
-wakeup:
-               CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n",
-                      ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc);
-
-               wake_up(&ocw->ocw_waitq);
-       }
-
-       EXIT;
-}
-EXPORT_SYMBOL(osc_wake_cache_waiters);
-
 static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc)
 {
        int hprpc = !!list_empty(&osc->oo_hp_exts);
@@ -1755,8 +1693,9 @@ static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc,
                }
                /* trigger a write rpc stream as long as there are dirtiers
                 * waiting for space.  as they're waiting, they're not going to
-                * create more pages to coalesce with what's waiting.. */
-               if (!list_empty(&cli->cl_cache_waiters)) {
+                * create more pages to coalesce with what's waiting..
+                */
+               if (waitqueue_active(&cli->cl_cache_waiters)) {
                        CDEBUG(D_CACHE, "cache waiters forcing RPC\n");
                        RETURN(1);
                }
@@ -2240,8 +2179,9 @@ static struct osc_object *osc_next_obj(struct client_obd *cli)
        /* then if we have cache waiters, return all objects with queued
         * writes.  This is especially important when many small files
         * have filled up the cache and not been fired into rpcs because
-        * they don't pass the nr_pending/object threshhold */
-       if (!list_empty(&cli->cl_cache_waiters) &&
+        * they don't pass the nr_pending/object threshhold
+        */
+       if (waitqueue_active(&cli->cl_cache_waiters) &&
            !list_empty(&cli->cl_loi_write_list))
                RETURN(list_to_obj(&cli->cl_loi_write_list, write_item));
 
lustre/osc/osc_internal.h
index 0ea5ede..b839ccf 100644 (file)
@@ -42,7 +42,6 @@ extern atomic_t osc_pool_req_count;
 extern unsigned int osc_reqpool_maxreqcount;
 extern struct ptlrpc_request_pool *osc_rq_pool;
 
-void osc_wake_cache_waiters(struct client_obd *cli);
 int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes);
 void osc_update_next_shrink(struct client_obd *cli);
 int lru_queue_work(const struct lu_env *env, void *data);
lustre/osc/osc_page.c
index 6097942..10eb42e 100644 (file)
@@ -140,7 +140,7 @@ static int osc_page_print(const struct lu_env *env,
                          "1< %#x %d %u %s %s > "
                          "2< %lld %u %u %#x %#x | %p %p %p > "
                          "3< %d %lld %d > "
-                         "4< %d %d %d %lu %s | %s %s %s %s > "
+                         "4< %d %d %d %lu %c | %s %s %s %s > "
                          "5< %s %s %s %s | %d %s | %d %s %s>\n",
                          opg, osc_index(opg),
                           /* 1 */
@@ -159,7 +159,7 @@ static int osc_page_print(const struct lu_env *env,
                           cli->cl_r_in_flight, cli->cl_w_in_flight,
                           cli->cl_max_rpcs_in_flight,
                           cli->cl_avail_grant,
-                          osc_list(&cli->cl_cache_waiters),
+                         waitqueue_active(&cli->cl_cache_waiters) ? '+' : '-',
                           osc_list(&cli->cl_loi_ready_list),
                           osc_list(&cli->cl_loi_hp_ready_list),
                           osc_list(&cli->cl_loi_write_list),