From b2ede01d1ed77ddc512c013220f6ea8b509e9541 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Dec 2018 14:48:45 +1100 Subject: [PATCH] LU-9679 osc: convert cl_cache_waiters to a wait_queue. cli->cl_cache_waiters is a list of tasks that need to be woken when grant-space becomes available. This means it is acting much like a wait queue. So let's change it to really be a wait queue. The current implementation adds new waiters to the end of the list, and calls osc_enter_cache_try() on each in order. We can provide the same behaviour by using an exclusive wait, and having each waiter wake the next task when it succeeds. If a waiter notices that success has become impossible, it wakes all other waiters. If a waiter times out, it doesn't wake others - just leaves them to time out themselves. Note that the old code handled -EINTR from the wait function. That is not a possible return value when wait_event_idle* is used, so that case is discarded. As we need wait_event_idle_exclusive_timeout_cmd(), we should fix the bug in that macro - the "might_sleep()" is wrong, as a spinlock might be held at that point. 
Linux-Commit: 31f45f56ecdf ("lustre: osc_cache: convert cl_cache_waiters to a wait_queue.") Signed-off-by: Mr NeilBrown Change-Id: Ib7622ea2daea8f6e59bef95d3b6c5a80d209b81e Reviewed-on: https://review.whamcloud.com/37605 Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- libcfs/include/libcfs/linux/linux-wait.h | 2 - lustre/include/lustre_osc.h | 13 +-- lustre/include/obd.h | 2 +- lustre/ldlm/ldlm_lib.c | 2 +- lustre/osc/osc_cache.c | 159 ++++++++++--------------------- lustre/osc/osc_internal.h | 1 - lustre/osc/osc_page.c | 2 +- 7 files changed, 58 insertions(+), 123 deletions(-) diff --git a/libcfs/include/libcfs/linux/linux-wait.h b/libcfs/include/libcfs/linux/linux-wait.h index 8fac3db..fd154ba 100644 --- a/libcfs/include/libcfs/linux/linux-wait.h +++ b/libcfs/include/libcfs/linux/linux-wait.h @@ -281,7 +281,6 @@ do { \ cmd1, cmd2) \ ({ \ long __ret = timeout; \ - might_sleep(); \ if (!___wait_cond_timeout1(condition)) \ __ret = __wait_event_idle_exclusive_timeout_cmd( \ wq_head, condition, timeout, cmd1, cmd2); \ @@ -400,7 +399,6 @@ do { \ cmd1, cmd2) \ ({ \ long __ret = timeout; \ - might_sleep(); \ if (!___wait_cond_timeout1(condition)) \ __ret = __wait_event_idle_exclusive_timeout_cmd( \ wq_head, condition, timeout, cmd1, cmd2); \ diff --git a/lustre/include/lustre_osc.h b/lustre/include/lustre_osc.h index 4c243ed..9a0fd6a 100644 --- a/lustre/include/lustre_osc.h +++ b/lustre/include/lustre_osc.h @@ -99,14 +99,6 @@ static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) return container_of(pga, struct osc_async_page, oap_brw_page); } -struct osc_cache_waiter { - struct list_head ocw_entry; - wait_queue_head_t ocw_waitq; - struct osc_async_page *ocw_oap; - int ocw_grant; - int ocw_rc; -}; - struct osc_device { struct cl_device od_cl; struct obd_export *od_exp; @@ -598,7 +590,10 @@ int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, pgoff_t start, 
pgoff_t end); int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, struct osc_object *osc, int async); -void osc_wake_cache_waiters(struct client_obd *cli); +static inline void osc_wake_cache_waiters(struct client_obd *cli) +{ + wake_up(&cli->cl_cache_waiters); +} static inline int osc_io_unplug_async(const struct lu_env *env, struct client_obd *cli, diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 6a5f2dc..9cbc8f7 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -221,7 +221,7 @@ struct client_obd { * grant before trying to dirty a page and unreserve the rest. * See osc_{reserve|unreserve}_grant for details. */ long cl_reserved_grant; - struct list_head cl_cache_waiters; /* waiting for cache/grant */ + wait_queue_head_t cl_cache_waiters; /* waiting for cache/grant */ time64_t cl_next_shrink_grant; /* seconds */ struct list_head cl_grant_chain; time64_t cl_grant_shrink_interval; /* seconds */ diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 8a0ea40..6d14580 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -378,7 +378,7 @@ int client_obd_setup(struct obd_device *obd, struct lustre_cfg *lcfg) * ptlrpc_connect_interpret(). */ client_adjust_max_dirty(cli); - INIT_LIST_HEAD(&cli->cl_cache_waiters); + init_waitqueue_head(&cli->cl_cache_waiters); INIT_LIST_HEAD(&cli->cl_loi_ready_list); INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); INIT_LIST_HEAD(&cli->cl_loi_write_list); diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index 56fd9e4..1f343e2 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -1533,15 +1533,26 @@ out: return rc; } -static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) +/* Following two inlines exist to pass code fragments + * to wait_event_idle_exclusive_timeout_cmd(). Passing + * code fragments as macro args can look confusing, so + * we provide inlines to encapsulate them. 
+ */ +static inline void cli_unlock_and_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_async_page *oap) { - int rc; - spin_lock(&cli->cl_loi_list_lock); - rc = list_empty(&ocw->ocw_entry); spin_unlock(&cli->cl_loi_list_lock); - return rc; + osc_io_unplug_async(env, cli, NULL); + CDEBUG(D_CACHE, + "%s: sleeping for cache space for %p\n", + cli_name(cli), oap); } +static inline void cli_lock_after_unplug(struct client_obd *cli) +{ + spin_lock(&cli->cl_loi_list_lock); +} /** * The main entry to reserve dirty page accounting. Usually the grant reserved * in this function will be freed in bulk in osc_free_grant() unless it fails @@ -1554,8 +1565,11 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, { struct osc_object *osc = oap->oap_obj; struct lov_oinfo *loi = osc->oo_oinfo; - struct osc_cache_waiter ocw; int rc = -EDQUOT; + unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout : at_max); + int remain; + bool entered = false; + ENTRY; OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); @@ -1571,83 +1585,40 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, GOTO(out, rc = -EDQUOT); } - /* Hopefully normal case - cache space and write credits available */ - if (list_empty(&cli->cl_cache_waiters) && - osc_enter_cache_try(cli, oap, bytes)) { - OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); - GOTO(out, rc = 0); - } - - /* We can get here for two reasons: too many dirty pages in cache, or + /* + * We can wait here for two reasons: too many dirty pages in cache, or * run out of grants. In both cases we should write dirty pages out. * Adding a cache waiter will trigger urgent write-out no matter what * RPC size will be. - * The exiting condition is no avail grants and no dirty pages caching, - * that really means there is no space on the OST. 
*/ - init_waitqueue_head(&ocw.ocw_waitq); - ocw.ocw_oap = oap; - ocw.ocw_grant = bytes; - while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) { - list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); - ocw.ocw_rc = 0; - spin_unlock(&cli->cl_loi_list_lock); - - osc_io_unplug_async(env, cli, NULL); - - CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", - cli_name(cli), &ocw, oap); - - rc = wait_event_idle_timeout(ocw.ocw_waitq, - ocw_granted(cli, &ocw), - cfs_time_seconds(AT_OFF ? - obd_timeout : - at_max)); - - spin_lock(&cli->cl_loi_list_lock); - - if (rc <= 0) { - /* wait_event_idle_timeout timed out */ - list_del_init(&ocw.ocw_entry); - if (rc == 0) - rc = -ETIMEDOUT; - break; - } - LASSERT(list_empty(&ocw.ocw_entry)); - rc = ocw.ocw_rc; - - if (rc != -EDQUOT) - break; - if (osc_enter_cache_try(cli, oap, bytes)) { - rc = 0; - break; - } - } - - switch (rc) { - case 0: - OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n"); - break; - case -ETIMEDOUT: + * The exiting condition (other than success) is no avail grants + * and no dirty pages caching, that really means there is no space + * on the OST. 
+ */ + remain = wait_event_idle_exclusive_timeout_cmd( + cli->cl_cache_waiters, + (entered = osc_enter_cache_try(cli, oap, bytes)) || + (cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0), + timeout, + cli_unlock_and_unplug(env, cli, oap), + cli_lock_after_unplug(cli)); + + if (entered) { + if (remain == timeout) + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); + else + OSC_DUMP_GRANT(D_CACHE, cli, + "finally got grant space\n"); + wake_up(&cli->cl_cache_waiters); + rc = 0; + } else if (remain == 0) { OSC_DUMP_GRANT(D_CACHE, cli, "timeout, fall back to sync i/o\n"); osc_extent_tree_dump(D_CACHE, osc); /* fall back to synchronous I/O */ - rc = -EDQUOT; - break; - case -EINTR: - /* Ensures restartability - LU-3581 */ - OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n"); - rc = -ERESTARTSYS; - break; - case -EDQUOT: + } else { OSC_DUMP_GRANT(D_CACHE, cli, "no grant space, fall back to sync i/o\n"); - break; - default: - CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived " - "due to %d, fall back to sync i/o\n", - cli_name(cli), &ocw, rc); - break; + wake_up_all(&cli->cl_cache_waiters); } EXIT; out: @@ -1655,36 +1626,6 @@ out: RETURN(rc); } -/* caller must hold loi_list_lock */ -void osc_wake_cache_waiters(struct client_obd *cli) -{ - struct list_head *l, *tmp; - struct osc_cache_waiter *ocw; - - ENTRY; - list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); - - ocw->ocw_rc = -EDQUOT; - - if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant)) - ocw->ocw_rc = 0; - - if (ocw->ocw_rc == 0 || - !(cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0)) { - list_del_init(&ocw->ocw_entry); - CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant " - "%ld, %d\n", ocw, ocw->ocw_oap, - cli->cl_avail_grant, ocw->ocw_rc); - - wake_up(&ocw->ocw_waitq); - } - } - - EXIT; -} -EXPORT_SYMBOL(osc_wake_cache_waiters); - static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) { int hprpc = 
!!list_empty(&osc->oo_hp_exts); @@ -1724,8 +1665,9 @@ static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, } /* trigger a write rpc stream as long as there are dirtiers * waiting for space. as they're waiting, they're not going to - * create more pages to coalesce with what's waiting.. */ - if (!list_empty(&cli->cl_cache_waiters)) { + * create more pages to coalesce with what's waiting.. + */ + if (waitqueue_active(&cli->cl_cache_waiters)) { CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); RETURN(1); } @@ -2177,8 +2119,9 @@ static struct osc_object *osc_next_obj(struct client_obd *cli) /* then if we have cache waiters, return all objects with queued * writes. This is especially important when many small files * have filled up the cache and not been fired into rpcs because - * they don't pass the nr_pending/object threshhold */ - if (!list_empty(&cli->cl_cache_waiters) && + * they don't pass the nr_pending/object threshhold + */ + if (waitqueue_active(&cli->cl_cache_waiters) && !list_empty(&cli->cl_loi_write_list)) RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h index b2bc407..5a65dcd 100644 --- a/lustre/osc/osc_internal.h +++ b/lustre/osc/osc_internal.h @@ -42,7 +42,6 @@ extern atomic_t osc_pool_req_count; extern unsigned int osc_reqpool_maxreqcount; extern struct ptlrpc_request_pool *osc_rq_pool; -void osc_wake_cache_waiters(struct client_obd *cli); int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); void osc_schedule_grant_work(void); void osc_update_next_shrink(struct client_obd *cli); diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c index d0fd5e2..8f3258f 100644 --- a/lustre/osc/osc_page.c +++ b/lustre/osc/osc_page.c @@ -150,7 +150,7 @@ static int osc_page_print(const struct lu_env *env, cli->cl_r_in_flight, cli->cl_w_in_flight, cli->cl_max_rpcs_in_flight, cli->cl_avail_grant, - list_empty_marker(&cli->cl_cache_waiters), + 
waitqueue_active(&cli->cl_cache_waiters) ? '+' : '-', list_empty_marker(&cli->cl_loi_ready_list), list_empty_marker(&cli->cl_loi_hp_ready_list), list_empty_marker(&cli->cl_loi_write_list), -- 1.8.3.1