From 5b9581087c138305a1d4202d496cfd1494a71f1c Mon Sep 17 00:00:00 2001
From: Oleg Drokin <green@whamcloud.com>
Date: Mon, 11 May 2020 20:05:32 -0400
Subject: [PATCH] LU-9679 osc: convert cl_cache_waiters to a wait_queue.

cli->cl_cache_waiters is a list of tasks that need to be woken
when grant space becomes available.  This means it is acting
much like a wait queue.  So let's change it to really be a
wait queue.

The current implementation adds new waiters to the end of the list,
and calls osc_enter_cache_try() on each in order.  We can provide
the same behaviour by using an exclusive wait, and having each
waiter wake the next task when it succeeds.  If a waiter notices
that success has become impossible, it wakes all other waiters.

If a waiter times out, it doesn't wake the others - it just leaves
them to time out themselves.

Note that the old code handled -EINTR from the wait function.  That
is not a possible return value when wait_event_idle* is used, so
that case is discarded.

As we need wait_event_idle_exclusive_timeout_cmd(), we should fix
the bug in that macro - the "might_sleep()" is wrong, as a spinlock
might be held at that point.

Linux-Commit: 31f45f56ecdf ("lustre: osc_cache: convert cl_cache_waiters to a wait_queue.")
Lustre-change: https://review.whamcloud.com/37605
Lustre-commit: b2ede01d1ed77ddc512c013220f6ea8b509e9541
Change-Id: Ib7622ea2daea8f6e59bef95d3b6c5a80d209b81e
Signed-off-by: Mr NeilBrown <neilb@suse.de>
Reviewed-on: https://review.whamcloud.com/38575
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---
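Notes: for review, the rewritten waiting logic in osc_enter_cache() condenses
to the sketch below.  This is illustrative only; grant_available() and
no_progress_possible() are hypothetical stand-ins for the real
osc_enter_cache_try() call and the "no dirty pages, no writes in flight"
test, and the caller is assumed to hold cl_loi_list_lock.

	static int reserve_grant_sketch(struct client_obd *cli, long timeout)
	{
		bool entered = false;
		long remain;
		int rc = -EDQUOT;

		/* Exclusive wait: wake_up() pops only the first waiter. */
		remain = wait_event_idle_exclusive_timeout_cmd(
			cli->cl_cache_waiters,
			(entered = grant_available(cli)) ||
			no_progress_possible(cli),
			timeout,
			/* cmd1: runs before each sleep, lock still held */
			spin_unlock(&cli->cl_loi_list_lock),
			/* cmd2: runs after each wakeup, retakes the lock */
			spin_lock(&cli->cl_loi_list_lock));

		if (entered) {
			/* Success: pass the wakeup on to the next waiter. */
			wake_up(&cli->cl_cache_waiters);
			rc = 0;
		} else if (remain == 0) {
			/* Timed out: peers are left to time out themselves. */
		} else {
			/* Success is impossible: fail all waiters promptly. */
			wake_up_all(&cli->cl_cache_waiters);
		}
		return rc;
	}

Because the wait is exclusive, a wake_up() wakes exactly one task; chaining
the wakeup on success reproduces the FIFO behaviour of the old waiter list.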
 libcfs/include/libcfs/linux/linux-wait.h | 445 ++++++++++++++++++++++++++++++-
 lustre/include/lustre_osc.h              |  13 +-
 lustre/include/obd.h                     |   2 +-
 lustre/ldlm/ldlm_lib.c                   |   2 +-
 lustre/osc/osc_cache.c                   | 162 ++++-------
 lustre/osc/osc_internal.h                |   1 -
 lustre/osc/osc_page.c                    |   4 +-
 7 files changed, 502 insertions(+), 127 deletions(-)

diff --git a/libcfs/include/libcfs/linux/linux-wait.h b/libcfs/include/libcfs/linux/linux-wait.h
index a497dce..fd154ba 100644
--- a/libcfs/include/libcfs/linux/linux-wait.h
+++ b/libcfs/include/libcfs/linux/linux-wait.h
@@ -2,6 +2,8 @@
 #ifndef __LIBCFS_LINUX_WAIT_BIT_H
 #define __LIBCFS_LINUX_WAIT_BIT_H
 
+/* Make sure we can see if we have TASK_NOLOAD */
+#include <linux/sched.h>
 /*
  * Linux wait-bit related types and methods:
  */
@@ -31,6 +33,18 @@ extern long prepare_to_wait_event(wait_queue_head_t *wq_head,
 				  wait_queue_entry_t *wq_entry, int state);
 #endif
 
+/* ___wait_cond_timeout changed number of args in v3.12-rc1-78-g35a2af94c7ce
+ * so let's define our own ___wait_cond_timeout1
+ */
+
+#define ___wait_cond_timeout1(condition)			\
+({								\
+	bool __cond = (condition);				\
+	if (__cond && !__ret)					\
+		__ret = 1;					\
+	__cond || !__ret;					\
+})
+
 #ifndef HAVE_CLEAR_AND_WAKE_UP_BIT
 /**
  * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit
@@ -110,7 +124,7 @@ do {								\
 })
 
 #define __wait_var_event_timeout(var, condition, timeout)	\
-	___wait_var_event(var, ___wait_cond_timeout(condition),	\
+	___wait_var_event(var, ___wait_cond_timeout1(condition), \
 			  TASK_UNINTERRUPTIBLE, 0, timeout,	\
 			  __ret = schedule_timeout(__ret))
 
@@ -118,10 +132,437 @@ do {								\
 ({								\
 	long __ret = timeout;					\
 	might_sleep();						\
-	if (!___wait_cond_timeout(condition))			\
+	if (!___wait_cond_timeout1(condition))			\
 		__ret = __wait_var_event_timeout(var, condition, timeout); \
 	__ret;							\
 })
 #endif /* ! HAVE_WAIT_VAR_EVENT */
 
+/*
+ * prepare_to_wait_event() does not support an exclusive
+ * lifo wait.
+ * However it will not relink the wait_queue_entry if
+ * it is already linked.  So we link to the head of the
+ * queue here, and it will stay there.
+ */
+static inline void prepare_to_wait_exclusive_head(
+	wait_queue_head_t *waitq, wait_queue_entry_t *link)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&(waitq->lock), flags);
+#ifdef HAVE_WAIT_QUEUE_ENTRY_LIST
+	if (list_empty(&link->entry))
+#else
+	if (list_empty(&link->task_list))
+#endif
+		__add_wait_queue_exclusive(waitq, link);
+	spin_unlock_irqrestore(&((waitq)->lock), flags);
+}
+
+#ifndef ___wait_event
+/*
+ * The below macro ___wait_event() has an explicit shadow of the __ret
+ * variable when used from the wait_event_*() macros.
+ *
+ * This is so that both can use the ___wait_cond_timeout1() construct
+ * to wrap the condition.
+ *
+ * The type inconsistency of the wait_event_*() __ret variable is also
+ * on purpose; we use long where we can return timeout values and int
+ * otherwise.
+ */
+
+#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)	\
+({									\
+	__label__ __out;						\
+	wait_queue_entry_t __wq_entry;					\
+	long __ret = ret;	/* explicit shadow */			\
+									\
+	init_wait(&__wq_entry);						\
+	if (exclusive)							\
+		__wq_entry.flags = WQ_FLAG_EXCLUSIVE;			\
+	for (;;) {							\
+		long __int = prepare_to_wait_event(&wq_head,		\
+						   &__wq_entry, state);	\
+									\
+		if (condition)						\
+			break;						\
+									\
+		if (___wait_is_interruptible(state) && __int) {		\
+			__ret = __int;					\
+			goto __out;					\
+		}							\
+									\
+		cmd;							\
+	}								\
+	finish_wait(&wq_head, &__wq_entry);				\
+__out:	__ret;								\
+})
+#endif
+
+#ifndef TASK_NOLOAD
+
+#define ___wait_event_idle(wq_head, condition, exclusive, ret, cmd)	\
+({									\
+	wait_queue_entry_t __wq_entry;					\
+	unsigned long flags;						\
+	long __ret = ret;	/* explicit shadow */			\
+	sigset_t __blocked;						\
+									\
+	__blocked = cfs_block_sigsinv(0);				\
+	init_wait(&__wq_entry);						\
+	if (exclusive)							\
+		__wq_entry.flags = WQ_FLAG_EXCLUSIVE;			\
+	for (;;) {							\
+		prepare_to_wait_event(&wq_head,				\
+				      &__wq_entry,			\
+				      TASK_INTERRUPTIBLE);		\
+									\
+		if (condition)						\
+			break;						\
+		/* We have to do this here because some signals */	\
+		/* are not blockable - ie from strace(1).       */	\
+		/* In these cases we want to schedule_timeout() */	\
+		/* again, because we don't want that to return  */	\
+		/* -EINTR when the RPC actually succeeded.      */	\
+		/* the recalc_sigpending() below will deliver the */	\
+		/* signal properly.				*/	\
+		if (signal_pending(current)) {				\
+			spin_lock_irqsave(&current->sighand->siglock,	\
+					  flags);			\
+			clear_tsk_thread_flag(current, TIF_SIGPENDING);	\
+			spin_unlock_irqrestore(&current->sighand->siglock,\
+					       flags);			\
+		}							\
+		cmd;							\
+	}								\
+	finish_wait(&wq_head, &__wq_entry);				\
+	cfs_restore_sigs(__blocked);					\
+	__ret;								\
+})
+
+#define wait_event_idle(wq_head, condition)				\
+do {									\
+	might_sleep();							\
+	if (!(condition))						\
+		___wait_event_idle(wq_head, condition, 0, 0, schedule());\
+} while (0)
+
+#define wait_event_idle_exclusive(wq_head, condition)			\
+do {									\
+	might_sleep();							\
+	if (!(condition))						\
+		___wait_event_idle(wq_head, condition, 1, 0, schedule());\
+} while (0)
+
+#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)\
+	___wait_event_idle(wq_head, ___wait_cond_timeout1(condition),	\
+			   1, timeout,					\
+			   __ret = schedule_timeout(__ret))
+
+#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout)	\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout1(condition))				\
+		__ret = __wait_event_idle_exclusive_timeout(		\
+			wq_head, condition, timeout);			\
+	__ret;								\
+})
+
+#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition,	\
+						timeout, cmd1, cmd2)	\
+	___wait_event_idle(wq_head, ___wait_cond_timeout1(condition),	\
+			   1, timeout,					\
+			   cmd1; __ret = schedule_timeout(__ret); cmd2)
+
+#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\
+					      cmd1, cmd2)		\
+({									\
+	long __ret = timeout;						\
+	if (!___wait_cond_timeout1(condition))				\
+		__ret = __wait_event_idle_exclusive_timeout_cmd(	\
+			wq_head, condition, timeout, cmd1, cmd2);	\
+	__ret;								\
+})
+
+#define __wait_event_idle_timeout(wq_head, condition, timeout)		\
+	___wait_event_idle(wq_head, ___wait_cond_timeout1(condition),	\
+			   0, timeout,					\
+			   __ret = schedule_timeout(__ret))
+
+#define wait_event_idle_timeout(wq_head, condition, timeout)		\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout1(condition))				\
+		__ret = __wait_event_idle_timeout(wq_head, condition,	\
+						  timeout);		\
+	__ret;								\
+})
+
+#else /* TASK_IDLE */
+#ifndef wait_event_idle
+/**
+ * wait_event_idle - wait for a condition without contributing to system load
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_IDLE) until the
+ * @condition evaluates to true.
+ * The @condition is checked each time the waitqueue @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ */
+#define wait_event_idle(wq_head, condition)				\
+do {									\
+	might_sleep();							\
+	if (!(condition))						\
+		___wait_event(wq_head, condition, TASK_IDLE, 0, 0,	\
+			      schedule());				\
+} while (0)
+#endif
+#ifndef wait_event_idle_exclusive
+/**
+ * wait_event_idle_exclusive - wait for a condition without contributing to
+ *			       system load
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_IDLE) until the
+ * @condition evaluates to true.
+ * The @condition is checked each time the waitqueue @wq_head is woken up.
+ *
+ * The process is put on the wait queue with a WQ_FLAG_EXCLUSIVE flag
+ * set, so if other processes wait on the same list, when this
+ * process is woken further processes are not considered.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ */
+#define wait_event_idle_exclusive(wq_head, condition)			\
+do {									\
+	might_sleep();							\
+	if (!(condition))						\
+		___wait_event(wq_head, condition, TASK_IDLE, 1, 0,	\
+			      schedule());				\
+} while (0)
+#endif
+#ifndef wait_event_idle_exclusive_timeout
+/**
+ * wait_event_idle_exclusive_timeout - sleep without load until a condition
+ *				       becomes true or a timeout elapses
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, in jiffies
+ *
+ * The process is put to sleep (TASK_IDLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq_head is woken up.
+ *
+ * The process is put on the wait queue with a WQ_FLAG_EXCLUSIVE flag
+ * set, so if other processes wait on the same list, when this
+ * process is woken further processes are not considered.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * or the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed.
+ */
+#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout)	\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout1(condition))				\
+		__ret = __wait_event_idle_exclusive_timeout(wq_head,	\
+							    condition,	\
+							    timeout);	\
+	__ret;								\
+})
+#endif
+#ifndef wait_event_idle_exclusive_timeout_cmd
+#define __wait_event_idle_exclusive_timeout_cmd(wq_head, condition,	\
+						timeout, cmd1, cmd2)	\
+	___wait_event(wq_head, ___wait_cond_timeout1(condition),	\
+		      TASK_IDLE, 1, timeout,				\
+		      cmd1; __ret = schedule_timeout(__ret); cmd2)
+
+#define wait_event_idle_exclusive_timeout_cmd(wq_head, condition, timeout,\
+					      cmd1, cmd2)		\
+({									\
+	long __ret = timeout;						\
+	if (!___wait_cond_timeout1(condition))				\
+		__ret = __wait_event_idle_exclusive_timeout_cmd(	\
+			wq_head, condition, timeout, cmd1, cmd2);	\
+	__ret;								\
+})
+#endif
+
+#ifndef wait_event_idle_timeout
+
+#define __wait_event_idle_timeout(wq_head, condition, timeout)		\
+	___wait_event(wq_head, ___wait_cond_timeout1(condition),	\
+		      TASK_IDLE, 0, timeout,				\
+		      __ret = schedule_timeout(__ret))
+
+/**
+ * wait_event_idle_timeout - sleep without load until a condition becomes
+ *			     true or a timeout elapses
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, in jiffies
+ *
+ * The process is put to sleep (TASK_IDLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * or the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed.
+ */
+#define wait_event_idle_timeout(wq_head, condition, timeout)		\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout1(condition))				\
+		__ret = __wait_event_idle_timeout(wq_head, condition,	\
+						  timeout);		\
+	__ret;								\
+})
+#endif
+#endif /* TASK_IDLE */
+
+/* ___wait_event_lifo is used for lifo exclusive 'idle' waits */
+#ifdef TASK_NOLOAD
+
+#define ___wait_event_lifo(wq_head, condition, ret, cmd)		\
+({									\
+	wait_queue_entry_t __wq_entry;					\
+	long __ret = ret;	/* explicit shadow */			\
+									\
+	init_wait(&__wq_entry);						\
+	__wq_entry.flags = WQ_FLAG_EXCLUSIVE;				\
+	for (;;) {							\
+		prepare_to_wait_exclusive_head(&wq_head, &__wq_entry);	\
+		prepare_to_wait_event(&wq_head, &__wq_entry, TASK_IDLE);\
+									\
+		if (condition)						\
+			break;						\
+									\
+		cmd;							\
+	}								\
+	finish_wait(&wq_head, &__wq_entry);				\
+	__ret;								\
+})
+#else
+#define ___wait_event_lifo(wq_head, condition, ret, cmd)		\
+({									\
+	wait_queue_entry_t __wq_entry;					\
+	unsigned long flags;						\
+	long __ret = ret;	/* explicit shadow */			\
+	sigset_t __blocked;						\
+									\
+	__blocked = cfs_block_sigsinv(0);				\
+	init_wait(&__wq_entry);						\
+	__wq_entry.flags = WQ_FLAG_EXCLUSIVE;				\
+	for (;;) {							\
+		prepare_to_wait_exclusive_head(&wq_head, &__wq_entry);	\
+		prepare_to_wait_event(&wq_head, &__wq_entry,		\
+				      TASK_INTERRUPTIBLE);		\
+									\
+		if (condition)						\
+			break;						\
+		/* See justification in ___wait_event_idle */		\
+		if (signal_pending(current)) {				\
+			spin_lock_irqsave(&current->sighand->siglock,	\
+					  flags);			\
+			clear_tsk_thread_flag(current, TIF_SIGPENDING);	\
+			spin_unlock_irqrestore(&current->sighand->siglock,\
+					       flags);			\
+		}							\
+		cmd;							\
+	}								\
+	cfs_restore_sigs(__blocked);					\
+	finish_wait(&wq_head, &__wq_entry);				\
+	__ret;								\
+})
+#endif
+
+#define wait_event_idle_exclusive_lifo(wq_head, condition)		\
+do {									\
+	might_sleep();							\
+	if (!(condition))						\
+		___wait_event_lifo(wq_head, condition, 0, schedule());	\
+} while (0)
+
+#define __wait_event_idle_lifo_timeout(wq_head, condition, timeout)	\
+	___wait_event_lifo(wq_head, ___wait_cond_timeout1(condition),	\
+			   timeout,					\
+			   __ret = schedule_timeout(__ret))
+
+#define wait_event_idle_exclusive_lifo_timeout(wq_head, condition, timeout)\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout1(condition))				\
+		__ret = __wait_event_idle_lifo_timeout(wq_head,		\
+						       condition,	\
+						       timeout);	\
+	__ret;								\
+})
+
+/* l_wait_event_abortable() is a bit like wait_event_killable()
+ * except there is a fixed set of signals which will abort:
+ * LUSTRE_FATAL_SIGS
+ */
+#define LUSTRE_FATAL_SIGS						\
+	(sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGTERM) |	\
+	 sigmask(SIGQUIT) | sigmask(SIGALRM))
+
+#define l_wait_event_abortable(wq, condition)				\
+({									\
+	sigset_t __new_blocked, __old_blocked;				\
+	int __ret = 0;							\
+	siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS);		\
+	sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked);		\
+	__ret = wait_event_interruptible(wq, condition);		\
+	sigprocmask(SIG_SETMASK, &__old_blocked, NULL);			\
+	__ret;								\
+})
+
+#define l_wait_event_abortable_timeout(wq, condition, timeout)		\
+({									\
+	sigset_t __new_blocked, __old_blocked;				\
+	int __ret = 0;							\
+	siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS);		\
+	sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked);		\
+	__ret = wait_event_interruptible_timeout(wq, condition, timeout);\
+	sigprocmask(SIG_SETMASK, &__old_blocked, NULL);			\
+	__ret;								\
+})
+
+#define l_wait_event_abortable_exclusive(wq, condition)			\
+({									\
+	sigset_t __new_blocked, __old_blocked;				\
+	int __ret = 0;							\
+	siginitsetinv(&__new_blocked, LUSTRE_FATAL_SIGS);		\
+	sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked);		\
+	__ret = wait_event_interruptible_exclusive(wq, condition);	\
+	sigprocmask(SIG_SETMASK, &__old_blocked, NULL);			\
+	__ret;								\
+})
+
 #endif /* __LIBCFS_LINUX_WAIT_BIT_H */
diff --git a/lustre/include/lustre_osc.h b/lustre/include/lustre_osc.h
index 32a2c63..c53cba8 100644
--- a/lustre/include/lustre_osc.h
+++ b/lustre/include/lustre_osc.h
@@ -100,14 +100,6 @@ static inline struct osc_async_page *brw_page2oap(struct brw_page *pga)
 	return container_of(pga, struct osc_async_page, oap_brw_page);
 }
 
-struct osc_cache_waiter {
-	struct list_head	ocw_entry;
-	wait_queue_head_t	ocw_waitq;
-	struct osc_async_page	*ocw_oap;
-	int			ocw_grant;
-	int			ocw_rc;
-};
-
 struct osc_device {
 	struct cl_device	od_cl;
 	struct obd_export	*od_exp;
@@ -614,7 +606,10 @@ int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
 			 pgoff_t start, pgoff_t end);
 int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
 		   struct osc_object *osc, int async);
-void osc_wake_cache_waiters(struct client_obd *cli);
+static inline void osc_wake_cache_waiters(struct client_obd *cli)
+{
+	wake_up(&cli->cl_cache_waiters);
+}
 
 static inline int osc_io_unplug_async(const struct lu_env *env,
 				      struct client_obd *cli,
diff --git a/lustre/include/obd.h b/lustre/include/obd.h
index e86cac4..10f22c0 100644
--- a/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@ -215,7 +215,7 @@ struct client_obd {
 	 * grant before trying to dirty a page and unreserve the rest.
 	 * See osc_{reserve|unreserve}_grant for details. */
 	long			cl_reserved_grant;
-	struct list_head	cl_cache_waiters; /* waiting for cache/grant */
+	wait_queue_head_t	cl_cache_waiters; /* waiting for cache/grant */
 	time64_t		cl_next_shrink_grant;	/* seconds */
 	struct list_head	cl_grant_chain;
 	time64_t		cl_grant_shrink_interval; /* seconds */
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index 43ea9de..6e0ac53 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -366,7 +366,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
 	/* cl_dirty_max_pages may be changed at connect time in
 	 * ptlrpc_connect_interpret(). */
 	client_adjust_max_dirty(cli);
-	INIT_LIST_HEAD(&cli->cl_cache_waiters);
+	init_waitqueue_head(&cli->cl_cache_waiters);
 	INIT_LIST_HEAD(&cli->cl_loi_ready_list);
 	INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
 	INIT_LIST_HEAD(&cli->cl_loi_write_list);
diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c
index d3f200e..8026d51 100644
--- a/lustre/osc/osc_cache.c
+++ b/lustre/osc/osc_cache.c
@@ -1561,15 +1561,26 @@ static int osc_enter_cache_try(struct client_obd *cli,
 	return rc;
 }
 
-static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw)
+/* Following two inlines exist to pass code fragments
+ * to wait_event_idle_exclusive_timeout_cmd().  Passing
+ * code fragments as macro args can look confusing, so
+ * we provide inlines to encapsulate them.
+ */
+static inline void cli_unlock_and_unplug(const struct lu_env *env,
+					 struct client_obd *cli,
+					 struct osc_async_page *oap)
 {
-	int rc;
-	spin_lock(&cli->cl_loi_list_lock);
-	rc = list_empty(&ocw->ocw_entry);
 	spin_unlock(&cli->cl_loi_list_lock);
-	return rc;
+	osc_io_unplug_async(env, cli, NULL);
+	CDEBUG(D_CACHE,
+	       "%s: sleeping for cache space for %p\n",
+	       cli_name(cli), oap);
 }
 
+static inline void cli_lock_after_unplug(struct client_obd *cli)
+{
+	spin_lock(&cli->cl_loi_list_lock);
+}
 /**
  * The main entry to reserve dirty page accounting. Usually the grant reserved
  * in this function will be freed in bulk in osc_free_grant() unless it fails
@@ -1582,13 +1593,12 @@
 {
 	struct osc_object *osc = oap->oap_obj;
 	struct lov_oinfo *loi = osc->oo_oinfo;
-	struct osc_cache_waiter ocw;
-	struct l_wait_info lwi;
 	int rc = -EDQUOT;
-	ENTRY;
+	unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout : at_max);
+	int remain;
+	bool entered = false;
 
-	lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(AT_OFF ? obd_timeout : at_max),
-			       NULL, LWI_ON_SIGNAL_NOOP, NULL);
+	ENTRY;
 
 	OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes);
 
@@ -1603,76 +1613,40 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
 		GOTO(out, rc = -EDQUOT);
 	}
 
-	/* Hopefully normal case - cache space and write credits available */
-	if (osc_enter_cache_try(cli, oap, bytes, 0)) {
-		OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n");
-		GOTO(out, rc = 0);
-	}
-
-	/* We can get here for two reasons: too many dirty pages in cache, or
+	/*
+	 * We can wait here for two reasons: too many dirty pages in cache, or
 	 * run out of grants. In both cases we should write dirty pages out.
 	 * Adding a cache waiter will trigger urgent write-out no matter what
 	 * RPC size will be.
-	 * The exiting condition is no avail grants and no dirty pages caching,
-	 * that really means there is no space on the OST.
-	 */
-	init_waitqueue_head(&ocw.ocw_waitq);
-	ocw.ocw_oap   = oap;
-	ocw.ocw_grant = bytes;
-	while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) {
-		list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters);
-		ocw.ocw_rc = 0;
-		spin_unlock(&cli->cl_loi_list_lock);
-
-		osc_io_unplug_async(env, cli, NULL);
-
-		CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n",
-		       cli_name(cli), &ocw, oap);
-
-		rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi);
-
-		spin_lock(&cli->cl_loi_list_lock);
-
-		if (rc < 0) {
-			/* l_wait_event is interrupted by signal or timed out */
-			list_del_init(&ocw.ocw_entry);
-			break;
-		}
-		LASSERT(list_empty(&ocw.ocw_entry));
-		rc = ocw.ocw_rc;
-
-		if (rc != -EDQUOT)
-			break;
-		if (osc_enter_cache_try(cli, oap, bytes, 0)) {
-			rc = 0;
-			break;
-		}
-	}
-
-	switch (rc) {
-	case 0:
-		OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n");
-		break;
-	case -ETIMEDOUT:
+	 * The exiting condition (other than success) is no avail grants
+	 * and no dirty pages caching, that really means there is no space
+	 * on the OST.
+	 */
+	remain = wait_event_idle_exclusive_timeout_cmd(
+		cli->cl_cache_waiters,
+		(entered = osc_enter_cache_try(cli, oap, bytes, 0)) ||
+		(cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0),
+		timeout,
+		cli_unlock_and_unplug(env, cli, oap),
+		cli_lock_after_unplug(cli));
+
+	if (entered) {
+		if (remain == timeout)
+			OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n");
+		else
+			OSC_DUMP_GRANT(D_CACHE, cli,
+				       "finally got grant space\n");
+		wake_up(&cli->cl_cache_waiters);
+		rc = 0;
+	} else if (remain == 0) {
 		OSC_DUMP_GRANT(D_CACHE, cli,
 			       "timeout, fall back to sync i/o\n");
 		osc_extent_tree_dump(D_CACHE, osc);
 		/* fall back to synchronous I/O */
-		rc = -EDQUOT;
-		break;
-	case -EINTR:
-		/* Ensures restartability - LU-3581 */
-		OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n");
-		rc = -ERESTARTSYS;
-		break;
-	case -EDQUOT:
+	} else {
 		OSC_DUMP_GRANT(D_CACHE, cli,
 			       "no grant space, fall back to sync i/o\n");
-		break;
-	default:
-		CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived "
-		       "due to %d, fall back to sync i/o\n",
-		       cli_name(cli), &ocw, rc);
-		break;
+		wake_up_all(&cli->cl_cache_waiters);
 	}
 	EXIT;
 out:
@@ -1680,42 +1654,6 @@ out:
 	RETURN(rc);
 }
 
-/* caller must hold loi_list_lock */
-void osc_wake_cache_waiters(struct client_obd *cli)
-{
-	struct list_head *l, *tmp;
-	struct osc_cache_waiter *ocw;
-
-	ENTRY;
-	list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
-		ocw = list_entry(l, struct osc_cache_waiter, ocw_entry);
-		list_del_init(&ocw->ocw_entry);
-
-		ocw->ocw_rc = -EDQUOT;
-		/* we can't dirty more */
-		if ((cli->cl_dirty_pages >= cli->cl_dirty_max_pages) ||
-		    (1 + atomic_long_read(&obd_dirty_pages) >
-		     obd_max_dirty_pages)) {
-			CDEBUG(D_CACHE, "no dirty room: dirty: %ld "
-			       "osc max %ld, sys max %ld\n",
-			       cli->cl_dirty_pages, cli->cl_dirty_max_pages,
-			       obd_max_dirty_pages);
-			goto wakeup;
-		}
-
-		if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0))
-			ocw->ocw_rc = 0;
-wakeup:
-		CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n",
-		       ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc);
-
-		wake_up(&ocw->ocw_waitq);
-	}
-
-	EXIT;
-}
-EXPORT_SYMBOL(osc_wake_cache_waiters);
-
 static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc)
 {
 	int hprpc = !!list_empty(&osc->oo_hp_exts);
@@ -1755,8 +1693,9 @@ static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc,
 	}
 	/* trigger a write rpc stream as long as there are dirtiers
 	 * waiting for space. as they're waiting, they're not going to
-	 * create more pages to coalesce with what's waiting.. */
-	if (!list_empty(&cli->cl_cache_waiters)) {
+	 * create more pages to coalesce with what's waiting..
+	 */
+	if (waitqueue_active(&cli->cl_cache_waiters)) {
 		CDEBUG(D_CACHE, "cache waiters forcing RPC\n");
 		RETURN(1);
 	}
@@ -2240,8 +2179,9 @@ static struct osc_object *osc_next_obj(struct client_obd *cli)
 	/* then if we have cache waiters, return all objects with queued
	 * writes.  This is especially important when many small files
 	 * have filled up the cache and not been fired into rpcs because
-	 * they don't pass the nr_pending/object threshhold */
-	if (!list_empty(&cli->cl_cache_waiters) &&
+	 * they don't pass the nr_pending/object threshhold
+	 */
+	if (waitqueue_active(&cli->cl_cache_waiters) &&
 	    !list_empty(&cli->cl_loi_write_list))
 		RETURN(list_to_obj(&cli->cl_loi_write_list, write_item));
 
diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h
index 0ea5ede..b839ccf 100644
--- a/lustre/osc/osc_internal.h
+++ b/lustre/osc/osc_internal.h
@@ -42,7 +42,6 @@ extern atomic_t osc_pool_req_count;
 extern unsigned int osc_reqpool_maxreqcount;
 extern struct ptlrpc_request_pool *osc_rq_pool;
 
-void osc_wake_cache_waiters(struct client_obd *cli);
 int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes);
 void osc_update_next_shrink(struct client_obd *cli);
 int lru_queue_work(const struct lu_env *env, void *data);
diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c
index 6097942..10eb42e3 100644
--- a/lustre/osc/osc_page.c
+++ b/lustre/osc/osc_page.c
@@ -140,7 +140,7 @@ static int osc_page_print(const struct lu_env *env,
 			  "1< %#x %d %u %s %s > "
 			  "2< %lld %u %u %#x %#x | %p %p %p > "
 			  "3< %d %lld %d > "
-			  "4< %d %d %d %lu %s | %s %s %s %s > "
+			  "4< %d %d %d %lu %c | %s %s %s %s > "
 			  "5< %s %s %s %s | %d %s | %d %s %s>\n",
 			  opg, osc_index(opg),
 			  /* 1 */
@@ -159,7 +159,7 @@ static int osc_page_print(const struct lu_env *env,
 			  cli->cl_r_in_flight, cli->cl_w_in_flight,
 			  cli->cl_max_rpcs_in_flight,
 			  cli->cl_avail_grant,
-			  osc_list(&cli->cl_cache_waiters),
+			  waitqueue_active(&cli->cl_cache_waiters) ? '+' : '-',
 			  osc_list(&cli->cl_loi_ready_list),
 			  osc_list(&cli->cl_loi_hp_ready_list),
 			  osc_list(&cli->cl_loi_write_list),
-- 
1.8.3.1
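The return contract of the new wait_event_idle_exclusive_timeout_cmd()
matches wait_event_timeout(): 0 if the condition is still false when the
timeout expires, 1 if it became true on the final check, and otherwise the
remaining jiffies.  A minimal consumer might look like the following sketch
(resource_waitq and resource_ready are hypothetical; the cmd1/cmd2 fragments
here just log around each sleep):

	static wait_queue_head_t resource_waitq;
	static bool resource_ready;

	static int wait_for_resource(void)
	{
		long remain;

		remain = wait_event_idle_exclusive_timeout_cmd(
				resource_waitq, resource_ready,
				cfs_time_seconds(30),
				pr_debug("sleeping\n"), /* cmd1: before each sleep */
				pr_debug("woken\n"));   /* cmd2: after each wakeup */

		if (remain == 0)
			return -ETIMEDOUT;	/* still not ready after 30s */
		return 0;	/* ready, with 'remain' jiffies to spare */
	}

As in osc_enter_cache(), an exclusive waiter that succeeds should call
wake_up() itself if the condition may also hold for the next waiter in the
queue.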