X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosc%2Fosc_cache.c;h=93cd2d6fdbab24b1f60d893cba5f55d4a150075d;hp=d3e23782555435f73ff141c11b1174c8165a5ca1;hb=1eee11c75ca13745d083410e1ced3a1a8b088ee9;hpb=39287411e32aeeed342c7da8417975e97310cd4a diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index d3e2378..93cd2d6 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -38,6 +38,7 @@ #define DEBUG_SUBSYSTEM S_OSC #include +#include #include "osc_internal.h" @@ -74,7 +75,7 @@ static inline char *ext_flags(struct osc_extent *ext, char *flags) { char *buf = flags; *buf++ = ext->oe_rw ? 'r' : 'w'; - if (ext->oe_intree) + if (!RB_EMPTY_NODE(&ext->oe_node)) *buf++ = 'i'; if (ext->oe_sync) *buf++ = 'S'; @@ -110,7 +111,7 @@ static const char *oes_strings[] = { /* ----- extent part 0 ----- */ \ __ext, EXTPARA(__ext), \ /* ----- part 1 ----- */ \ - atomic_read(&__ext->oe_refc), \ + kref_read(&__ext->oe_refc), \ atomic_read(&__ext->oe_users), \ list_empty_marker(&__ext->oe_link), \ oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ @@ -154,7 +155,7 @@ static inline struct osc_extent *next_extent(struct osc_extent *ext) if (ext == NULL) return NULL; - LASSERT(ext->oe_intree); + LASSERT(!RB_EMPTY_NODE(&ext->oe_node)); return rb_extent(rb_next(&ext->oe_node)); } @@ -163,7 +164,7 @@ static inline struct osc_extent *prev_extent(struct osc_extent *ext) if (ext == NULL) return NULL; - LASSERT(ext->oe_intree); + LASSERT(!RB_EMPTY_NODE(&ext->oe_node)); return rb_extent(rb_prev(&ext->oe_node)); } @@ -186,10 +187,10 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, if (ext->oe_state >= OES_STATE_MAX) GOTO(out, rc = 10); - if (atomic_read(&ext->oe_refc) <= 0) + if (kref_read(&ext->oe_refc) <= 0) GOTO(out, rc = 20); - if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) + if (kref_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) GOTO(out, rc = 30); switch (ext->oe_state) { @@ -314,8 +315,8 @@ static void osc_extent_state_set(struct osc_extent *ext, int state) /* LASSERT(sanity_check_nolock(ext) == 0); */ /* TODO: validate the state machine */ - ext->oe_state = state; - wake_up_all(&ext->oe_waitq); + smp_store_release(&ext->oe_state, state); + wake_up(&ext->oe_waitq); } static struct osc_extent *osc_extent_alloc(struct osc_object *obj) @@ -329,7 +330,7 @@ static struct osc_extent *osc_extent_alloc(struct osc_object *obj) RB_CLEAR_NODE(&ext->oe_node); ext->oe_obj = obj; cl_object_get(osc2cl(obj)); - atomic_set(&ext->oe_refc, 1); + kref_init(&ext->oe_refc); atomic_set(&ext->oe_users, 0); INIT_LIST_HEAD(&ext->oe_link); ext->oe_state = OES_INV; @@ -340,35 +341,50 @@ static struct osc_extent *osc_extent_alloc(struct osc_object *obj) return ext; } -static void osc_extent_free(struct osc_extent *ext) +static void osc_extent_free(struct kref *kref) { + struct osc_extent *ext = container_of(kref, struct osc_extent, + oe_refc); + + LASSERT(list_empty(&ext->oe_link)); + LASSERT(atomic_read(&ext->oe_users) == 0); + LASSERT(ext->oe_state == OES_INV); + LASSERT(RB_EMPTY_NODE(&ext->oe_node)); + + if (ext->oe_dlmlock) { + lu_ref_del(&ext->oe_dlmlock->l_reference, + "osc_extent", ext); + LDLM_LOCK_PUT(ext->oe_dlmlock); + ext->oe_dlmlock = NULL; + } +#if 0 + /* If/When cl_object_put drops the need for 'env', + * this code can be enabled, and matching code in + * osc_extent_put removed. + */ + cl_object_put(osc2cl(ext->oe_obj)); + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); +#endif } static struct osc_extent *osc_extent_get(struct osc_extent *ext) { - LASSERT(atomic_read(&ext->oe_refc) >= 0); - atomic_inc(&ext->oe_refc); + LASSERT(kref_read(&ext->oe_refc) >= 0); + kref_get(&ext->oe_refc); return ext; } static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) { - LASSERT(atomic_read(&ext->oe_refc) > 0); - if (atomic_dec_and_test(&ext->oe_refc)) { - LASSERT(list_empty(&ext->oe_link)); - LASSERT(atomic_read(&ext->oe_users) == 0); - LASSERT(ext->oe_state == OES_INV); - LASSERT(!ext->oe_intree); - - if (ext->oe_dlmlock != NULL) { - lu_ref_add(&ext->oe_dlmlock->l_reference, - "osc_extent", ext); - LDLM_LOCK_PUT(ext->oe_dlmlock); - ext->oe_dlmlock = NULL; - } + LASSERT(kref_read(&ext->oe_refc) > 0); + if (kref_put(&ext->oe_refc, osc_extent_free)) { + /* This should be in osc_extent_free(), but + * while we need to pass 'env' it cannot be. + */ cl_object_put(env, osc2cl(ext->oe_obj)); - osc_extent_free(ext); + + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); } } @@ -379,9 +395,9 @@ static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) */ static void osc_extent_put_trust(struct osc_extent *ext) { - LASSERT(atomic_read(&ext->oe_refc) > 1); + LASSERT(kref_read(&ext->oe_refc) > 1); assert_osc_object_is_locked(ext->oe_obj); - atomic_dec(&ext->oe_refc); + osc_extent_put(NULL, ext); } /** @@ -431,7 +447,7 @@ static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) struct rb_node *parent = NULL; struct osc_extent *tmp; - LASSERT(ext->oe_intree == 0); + LASSERT(RB_EMPTY_NODE(&ext->oe_node)); LASSERT(ext->oe_obj == obj); assert_osc_object_is_locked(obj); while (*n != NULL) { @@ -448,7 +464,6 @@ static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) rb_link_node(&ext->oe_node, parent, n); rb_insert_color(&ext->oe_node, &obj->oo_root); osc_extent_get(ext); - ext->oe_intree = 1; } /* caller must have held object lock. */ @@ -456,9 +471,9 @@ static void osc_extent_erase(struct osc_extent *ext) { struct osc_object *obj = ext->oe_obj; assert_osc_object_is_locked(obj); - if (ext->oe_intree) { + if (!RB_EMPTY_NODE(&ext->oe_node)) { rb_erase(&ext->oe_node, &obj->oo_root); - ext->oe_intree = 0; + RB_CLEAR_NODE(&ext->oe_node); /* rbtree held a refcount */ osc_extent_put_trust(ext); } @@ -594,7 +609,10 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) if (osc_extent_merge(env, ext, next_extent(ext)) == 0) grant += cli->cl_grant_extent_tax; - if (ext->oe_urgent) + if (ext->oe_hp) + list_move_tail(&ext->oe_link, + &obj->oo_hp_exts); + else if (ext->oe_urgent) list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); else if (ext->oe_nr_pages == ext->oe_mppr) { @@ -893,17 +911,6 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, RETURN(0); } -static int extent_wait_cb(struct osc_extent *ext, enum osc_extent_state state) -{ - int ret; - - osc_object_lock(ext->oe_obj); - ret = ext->oe_state == state; - osc_object_unlock(ext->oe_obj); - - return ret; -} - /** * Wait for the extent's state to become @state. */ @@ -932,14 +939,16 @@ static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, osc_extent_release(env, ext); /* wait for the extent until its state becomes @state */ - rc = wait_event_idle_timeout(ext->oe_waitq, extent_wait_cb(ext, state), + rc = wait_event_idle_timeout(ext->oe_waitq, + smp_load_acquire(&ext->oe_state) == state, cfs_time_seconds(600)); if (rc == 0) { OSC_EXTENT_DUMP(D_ERROR, ext, "%s: wait ext to %u timedout, recovery in progress?\n", cli_name(osc_cli(obj)), state); - wait_event_idle(ext->oe_waitq, extent_wait_cb(ext, state)); + wait_event_idle(ext->oe_waitq, + smp_load_acquire(&ext->oe_state) == state); } if (ext->oe_rc < 0) rc = ext->oe_rc; @@ -1543,15 +1552,26 @@ out: return rc; } -static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) +/* Following two inlines exist to pass code fragments + * to wait_event_idle_exclusive_timeout_cmd(). Passing + * code fragments as macro args can look confusing, so + * we provide inlines to encapsulate them. + */ +static inline void cli_unlock_and_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_async_page *oap) { - int rc; - spin_lock(&cli->cl_loi_list_lock); - rc = list_empty(&ocw->ocw_entry); spin_unlock(&cli->cl_loi_list_lock); - return rc; + osc_io_unplug_async(env, cli, NULL); + CDEBUG(D_CACHE, + "%s: sleeping for cache space for %p\n", + cli_name(cli), oap); } +static inline void cli_lock_after_unplug(struct client_obd *cli) +{ + spin_lock(&cli->cl_loi_list_lock); +} /** * The main entry to reserve dirty page accounting. Usually the grant reserved * in this function will be freed in bulk in osc_free_grant() unless it fails @@ -1562,10 +1582,22 @@ static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, struct osc_async_page *oap, int bytes) { - struct osc_object *osc = oap->oap_obj; - struct lov_oinfo *loi = osc->oo_oinfo; - struct osc_cache_waiter ocw; - int rc = -EDQUOT; + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + int rc = -EDQUOT; + int remain; + bool entered = false; + /* We cannot wait for a long time here since we are holding ldlm lock + * across the actual IO. If no requests complete fast (e.g. due to + * overloaded OST that takes a long time to process everything, we'd + * get evicted if we wait for a normal obd_timeout or some such. + * So we try to wait half the time it would take the client to be + * evicted by server which is half obd_timeout when AT is off + * or at least ldlm_enqueue_min with AT on. + * See LU-13131 */ + unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout / 2 : + ldlm_enqueue_min / 2); + ENTRY; OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); @@ -1581,83 +1613,40 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, GOTO(out, rc = -EDQUOT); } - /* Hopefully normal case - cache space and write credits available */ - if (list_empty(&cli->cl_cache_waiters) && - osc_enter_cache_try(cli, oap, bytes)) { - OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); - GOTO(out, rc = 0); - } - - /* We can get here for two reasons: too many dirty pages in cache, or + /* + * We can wait here for two reasons: too many dirty pages in cache, or * run out of grants. In both cases we should write dirty pages out. * Adding a cache waiter will trigger urgent write-out no matter what * RPC size will be. - * The exiting condition is no avail grants and no dirty pages caching, - * that really means there is no space on the OST. */ - init_waitqueue_head(&ocw.ocw_waitq); - ocw.ocw_oap = oap; - ocw.ocw_grant = bytes; - while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) { - list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); - ocw.ocw_rc = 0; - spin_unlock(&cli->cl_loi_list_lock); - - osc_io_unplug_async(env, cli, NULL); - - CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", - cli_name(cli), &ocw, oap); - - rc = wait_event_idle_timeout(ocw.ocw_waitq, - ocw_granted(cli, &ocw), - cfs_time_seconds(AT_OFF ? - obd_timeout : - at_max)); - - spin_lock(&cli->cl_loi_list_lock); - - if (rc <= 0) { - /* wait_event_idle_timeout timed out */ - list_del_init(&ocw.ocw_entry); - if (rc == 0) - rc = -ETIMEDOUT; - break; - } - LASSERT(list_empty(&ocw.ocw_entry)); - rc = ocw.ocw_rc; - - if (rc != -EDQUOT) - break; - if (osc_enter_cache_try(cli, oap, bytes)) { - rc = 0; - break; - } - } - - switch (rc) { - case 0: - OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n"); - break; - case -ETIMEDOUT: + * The exiting condition (other than success) is no avail grants + * and no dirty pages caching, that really means there is no space + * on the OST. + */ + remain = wait_event_idle_exclusive_timeout_cmd( + cli->cl_cache_waiters, + (entered = osc_enter_cache_try(cli, oap, bytes)) || + (cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0), + timeout, + cli_unlock_and_unplug(env, cli, oap), + cli_lock_after_unplug(cli)); + + if (entered) { + if (remain == timeout) + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); + else + OSC_DUMP_GRANT(D_CACHE, cli, + "finally got grant space\n"); + wake_up(&cli->cl_cache_waiters); + rc = 0; + } else if (remain == 0) { OSC_DUMP_GRANT(D_CACHE, cli, "timeout, fall back to sync i/o\n"); osc_extent_tree_dump(D_CACHE, osc); /* fall back to synchronous I/O */ - rc = -EDQUOT; - break; - case -EINTR: - /* Ensures restartability - LU-3581 */ - OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n"); - rc = -ERESTARTSYS; - break; - case -EDQUOT: + } else { OSC_DUMP_GRANT(D_CACHE, cli, "no grant space, fall back to sync i/o\n"); - break; - default: - CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived " - "due to %d, fall back to sync i/o\n", - cli_name(cli), &ocw, rc); - break; + wake_up_all(&cli->cl_cache_waiters); } EXIT; out: @@ -1665,36 +1654,6 @@ out: RETURN(rc); } -/* caller must hold loi_list_lock */ -void osc_wake_cache_waiters(struct client_obd *cli) -{ - struct list_head *l, *tmp; - struct osc_cache_waiter *ocw; - - ENTRY; - list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); - - ocw->ocw_rc = -EDQUOT; - - if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant)) - ocw->ocw_rc = 0; - - if (ocw->ocw_rc == 0 || - !(cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0)) { - list_del_init(&ocw->ocw_entry); - CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant " - "%ld, %d\n", ocw, ocw->ocw_oap, - cli->cl_avail_grant, ocw->ocw_rc); - - wake_up(&ocw->ocw_waitq); - } - } - - EXIT; -} -EXPORT_SYMBOL(osc_wake_cache_waiters); - static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) { int hprpc = !!list_empty(&osc->oo_hp_exts); @@ -1734,8 +1693,9 @@ static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, } /* trigger a write rpc stream as long as there are dirtiers * waiting for space. as they're waiting, they're not going to - * create more pages to coalesce with what's waiting.. */ - if (!list_empty(&cli->cl_cache_waiters)) { + * create more pages to coalesce with what's waiting.. + */ + if (waitqueue_active(&cli->cl_cache_waiters)) { CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); RETURN(1); } @@ -2035,19 +1995,16 @@ static unsigned int get_write_extents(struct osc_object *obj, if (data.erd_page_count == data.erd_max_pages) return data.erd_page_count; - ext = first_extent(obj); - while (ext != NULL) { + for (ext = first_extent(obj); + ext; + ext = next_extent(ext)) { if ((ext->oe_state != OES_CACHE) || /* this extent may be already in current rpclist */ - (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) { - ext = next_extent(ext); + (!list_empty(&ext->oe_link) && ext->oe_owner)) continue; - } if (!try_to_add_extent_for_io(cli, ext, &data)) return data.erd_page_count; - - ext = next_extent(ext); } return data.erd_page_count; } @@ -2190,8 +2147,9 @@ static struct osc_object *osc_next_obj(struct client_obd *cli) /* then if we have cache waiters, return all objects with queued * writes. This is especially important when many small files * have filled up the cache and not been fired into rpcs because - * they don't pass the nr_pending/object threshhold */ - if (!list_empty(&cli->cl_cache_waiters) && + * they don't pass the nr_pending/object threshhold + */ + if (waitqueue_active(&cli->cl_cache_waiters) && !list_empty(&cli->cl_loi_write_list)) RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); @@ -2222,7 +2180,12 @@ __must_hold(&cli->cl_loi_list_lock) OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); - if (osc_max_rpc_in_flight(cli, osc)) { + /* even if we have reached our max in flight RPCs, we still + * allow all high-priority RPCs through to prevent their + * starvation and leading to server evicting us for not + * writing out pages in a timely manner LU-13131 */ + if (osc_max_rpc_in_flight(cli, osc) && + list_empty(&osc->oo_hp_exts)) { __osc_list_maint(cli, osc); break; } @@ -2599,7 +2562,7 @@ int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; spin_unlock(&oap->oap_lock); - if (memory_pressure_get()) + if (current->flags & PF_MEMALLOC) ext->oe_memalloc = 1; ext->oe_urgent = 1; @@ -2945,7 +2908,7 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, EASSERT(!ext->oe_hp, ext); ext->oe_hp = 1; list = &obj->oo_hp_exts; - } else if (!ext->oe_urgent) { + } else if (!ext->oe_urgent && !ext->oe_hp) { ext->oe_urgent = 1; list = &obj->oo_urgent_exts; }