X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fosc%2Fosc_cache.c;h=93cd2d6fdbab24b1f60d893cba5f55d4a150075d;hb=1eee11c75ca13745d083410e1ced3a1a8b088ee9;hp=3e179c1464d8a54c25ffb2c2056917ea6c8ea8f7;hpb=c1cab789aaa25bbb4062208aeb2822fde3007cd4;p=fs%2Flustre-release.git diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index 3e179c1..93cd2d6 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -38,6 +38,7 @@ #define DEBUG_SUBSYSTEM S_OSC #include +#include #include "osc_internal.h" @@ -74,7 +75,7 @@ static inline char *ext_flags(struct osc_extent *ext, char *flags) { char *buf = flags; *buf++ = ext->oe_rw ? 'r' : 'w'; - if (ext->oe_intree) + if (!RB_EMPTY_NODE(&ext->oe_node)) *buf++ = 'i'; if (ext->oe_sync) *buf++ = 'S'; @@ -94,11 +95,6 @@ static inline char *ext_flags(struct osc_extent *ext, char *flags) return flags; } -static inline char list_empty_marker(struct list_head *list) -{ - return list_empty(list) ? '-' : '+'; -} - #define EXTSTR "[%lu -> %lu/%lu]" #define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end static const char *oes_strings[] = { @@ -115,7 +111,7 @@ static const char *oes_strings[] = { /* ----- extent part 0 ----- */ \ __ext, EXTPARA(__ext), \ /* ----- part 1 ----- */ \ - atomic_read(&__ext->oe_refc), \ + kref_read(&__ext->oe_refc), \ atomic_read(&__ext->oe_users), \ list_empty_marker(&__ext->oe_link), \ oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ @@ -151,10 +147,7 @@ static const char *oes_strings[] = { static inline struct osc_extent *rb_extent(struct rb_node *n) { - if (n == NULL) - return NULL; - - return container_of(n, struct osc_extent, oe_node); + return rb_entry_safe(n, struct osc_extent, oe_node); } static inline struct osc_extent *next_extent(struct osc_extent *ext) @@ -162,7 +155,7 @@ static inline struct osc_extent *next_extent(struct osc_extent *ext) if (ext == NULL) return NULL; - LASSERT(ext->oe_intree); + LASSERT(!RB_EMPTY_NODE(&ext->oe_node)); return rb_extent(rb_next(&ext->oe_node)); } @@ -171,7 +164,7 @@ static inline struct osc_extent *prev_extent(struct osc_extent *ext) if (ext == NULL) return NULL; - LASSERT(ext->oe_intree); + LASSERT(!RB_EMPTY_NODE(&ext->oe_node)); return rb_extent(rb_prev(&ext->oe_node)); } @@ -189,16 +182,15 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, size_t page_count; int rc = 0; - if (!osc_object_is_locked(obj)) - GOTO(out, rc = 9); + assert_osc_object_is_locked(obj); if (ext->oe_state >= OES_STATE_MAX) GOTO(out, rc = 10); - if (atomic_read(&ext->oe_refc) <= 0) + if (kref_read(&ext->oe_refc) <= 0) GOTO(out, rc = 20); - if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) + if (kref_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) GOTO(out, rc = 30); switch (ext->oe_state) { @@ -220,6 +212,7 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, GOTO(out, rc = 60); if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) GOTO(out, rc = 65); + /* fallthrough */ default: if (atomic_read(&ext->oe_users) > 0) GOTO(out, rc = 70); @@ -285,6 +278,11 @@ out: __res; \ }) +static inline bool +overlapped(const struct osc_extent *ex1, const struct osc_extent *ex2) +{ + return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); +} /** * sanity check - to make sure there is no overlapped extent in the tree. @@ -294,7 +292,7 @@ static int osc_extent_is_overlapped(struct osc_object *obj, { struct osc_extent *tmp; - LASSERT(osc_object_is_locked(obj)); + assert_osc_object_is_locked(obj); if (!extent_debug) return 0; @@ -302,8 +300,7 @@ static int osc_extent_is_overlapped(struct osc_object *obj, for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) { if (tmp == ext) continue; - if (tmp->oe_end >= ext->oe_start && - tmp->oe_start <= ext->oe_end) + if (overlapped(tmp, ext)) return 1; } return 0; @@ -311,15 +308,15 @@ static int osc_extent_is_overlapped(struct osc_object *obj, static void osc_extent_state_set(struct osc_extent *ext, int state) { - LASSERT(osc_object_is_locked(ext->oe_obj)); + assert_osc_object_is_locked(ext->oe_obj); LASSERT(state >= OES_INV && state < OES_STATE_MAX); /* Never try to sanity check a state changing extent :-) */ /* LASSERT(sanity_check_nolock(ext) == 0); */ /* TODO: validate the state machine */ - ext->oe_state = state; - wake_up_all(&ext->oe_waitq); + smp_store_release(&ext->oe_state, state); + wake_up(&ext->oe_waitq); } static struct osc_extent *osc_extent_alloc(struct osc_object *obj) @@ -333,7 +330,7 @@ static struct osc_extent *osc_extent_alloc(struct osc_object *obj) RB_CLEAR_NODE(&ext->oe_node); ext->oe_obj = obj; cl_object_get(osc2cl(obj)); - atomic_set(&ext->oe_refc, 1); + kref_init(&ext->oe_refc); atomic_set(&ext->oe_users, 0); INIT_LIST_HEAD(&ext->oe_link); ext->oe_state = OES_INV; @@ -344,35 +341,50 @@ static struct osc_extent *osc_extent_alloc(struct osc_object *obj) return ext; } -static void osc_extent_free(struct osc_extent *ext) +static void osc_extent_free(struct kref *kref) { + struct osc_extent *ext = container_of(kref, struct osc_extent, + oe_refc); + + LASSERT(list_empty(&ext->oe_link)); + LASSERT(atomic_read(&ext->oe_users) == 0); + LASSERT(ext->oe_state == OES_INV); + LASSERT(RB_EMPTY_NODE(&ext->oe_node)); + + if (ext->oe_dlmlock) { + lu_ref_del(&ext->oe_dlmlock->l_reference, + "osc_extent", ext); + LDLM_LOCK_PUT(ext->oe_dlmlock); + ext->oe_dlmlock = NULL; + } +#if 0 + /* If/When cl_object_put drops the need for 'env', + * this code can be enabled, and matching code in + * osc_extent_put removed. + */ + cl_object_put(osc2cl(ext->oe_obj)); + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); +#endif } static struct osc_extent *osc_extent_get(struct osc_extent *ext) { - LASSERT(atomic_read(&ext->oe_refc) >= 0); - atomic_inc(&ext->oe_refc); + LASSERT(kref_read(&ext->oe_refc) >= 0); + kref_get(&ext->oe_refc); return ext; } static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) { - LASSERT(atomic_read(&ext->oe_refc) > 0); - if (atomic_dec_and_test(&ext->oe_refc)) { - LASSERT(list_empty(&ext->oe_link)); - LASSERT(atomic_read(&ext->oe_users) == 0); - LASSERT(ext->oe_state == OES_INV); - LASSERT(!ext->oe_intree); - - if (ext->oe_dlmlock != NULL) { - lu_ref_add(&ext->oe_dlmlock->l_reference, - "osc_extent", ext); - LDLM_LOCK_PUT(ext->oe_dlmlock); - ext->oe_dlmlock = NULL; - } + LASSERT(kref_read(&ext->oe_refc) > 0); + if (kref_put(&ext->oe_refc, osc_extent_free)) { + /* This should be in osc_extent_free(), but + * while we need to pass 'env' it cannot be. + */ cl_object_put(env, osc2cl(ext->oe_obj)); - osc_extent_free(ext); + + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); } } @@ -383,9 +395,9 @@ static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) */ static void osc_extent_put_trust(struct osc_extent *ext) { - LASSERT(atomic_read(&ext->oe_refc) > 1); - LASSERT(osc_object_is_locked(ext->oe_obj)); - atomic_dec(&ext->oe_refc); + LASSERT(kref_read(&ext->oe_refc) > 1); + assert_osc_object_is_locked(ext->oe_obj); + osc_extent_put(NULL, ext); } /** @@ -398,7 +410,7 @@ static struct osc_extent *osc_extent_search(struct osc_object *obj, struct rb_node *n = obj->oo_root.rb_node; struct osc_extent *tmp, *p = NULL; - LASSERT(osc_object_is_locked(obj)); + assert_osc_object_is_locked(obj); while (n != NULL) { tmp = rb_extent(n); if (index < tmp->oe_start) { @@ -435,9 +447,9 @@ static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) struct rb_node *parent = NULL; struct osc_extent *tmp; - LASSERT(ext->oe_intree == 0); + LASSERT(RB_EMPTY_NODE(&ext->oe_node)); LASSERT(ext->oe_obj == obj); - LASSERT(osc_object_is_locked(obj)); + assert_osc_object_is_locked(obj); while (*n != NULL) { tmp = rb_extent(*n); parent = *n; @@ -452,17 +464,16 @@ static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) rb_link_node(&ext->oe_node, parent, n); rb_insert_color(&ext->oe_node, &obj->oo_root); osc_extent_get(ext); - ext->oe_intree = 1; } /* caller must have held object lock. */ static void osc_extent_erase(struct osc_extent *ext) { struct osc_object *obj = ext->oe_obj; - LASSERT(osc_object_is_locked(obj)); - if (ext->oe_intree) { + assert_osc_object_is_locked(obj); + if (!RB_EMPTY_NODE(&ext->oe_node)) { rb_erase(&ext->oe_node, &obj->oo_root); - ext->oe_intree = 0; + RB_CLEAR_NODE(&ext->oe_node); /* rbtree held a refcount */ osc_extent_put_trust(ext); } @@ -472,7 +483,7 @@ static struct osc_extent *osc_extent_hold(struct osc_extent *ext) { struct osc_object *obj = ext->oe_obj; - LASSERT(osc_object_is_locked(obj)); + assert_osc_object_is_locked(obj); LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE); if (ext->oe_state == OES_CACHE) { osc_extent_state_set(ext, OES_ACTIVE); @@ -485,7 +496,7 @@ static struct osc_extent *osc_extent_hold(struct osc_extent *ext) static void __osc_extent_remove(struct osc_extent *ext) { - LASSERT(osc_object_is_locked(ext->oe_obj)); + assert_osc_object_is_locked(ext->oe_obj); LASSERT(list_empty(&ext->oe_pages)); osc_extent_erase(ext); list_del_init(&ext->oe_link); @@ -516,7 +527,7 @@ static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, int ppc_bits; LASSERT(cur->oe_state == OES_CACHE); - LASSERT(osc_object_is_locked(obj)); + assert_osc_object_is_locked(obj); if (victim == NULL) return -EINVAL; @@ -584,6 +595,7 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) * osc_cache_truncate_start(). */ osc_extent_state_set(ext, OES_TRUNC); ext->oe_trunc_pending = 0; + osc_object_unlock(obj); } else { int grant = 0; @@ -596,18 +608,21 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) grant += cli->cl_grant_extent_tax; if (osc_extent_merge(env, ext, next_extent(ext)) == 0) grant += cli->cl_grant_extent_tax; - if (grant > 0) - osc_unreserve_grant(cli, 0, grant); - if (ext->oe_urgent) + if (ext->oe_hp) + list_move_tail(&ext->oe_link, + &obj->oo_hp_exts); + else if (ext->oe_urgent) list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); else if (ext->oe_nr_pages == ext->oe_mppr) { list_move_tail(&ext->oe_link, &obj->oo_full_exts); } + osc_object_unlock(obj); + if (grant > 0) + osc_unreserve_grant(cli, 0, grant); } - osc_object_unlock(obj); osc_io_unplug_async(env, cli, obj); } @@ -615,11 +630,6 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) RETURN(rc); } -static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) -{ - return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); -} - /** * Find or create an extent which includes @index, core function to manage * extent tree. @@ -697,11 +707,11 @@ static struct osc_extent *osc_extent_find(const struct lu_env *env, restart: osc_object_lock(obj); ext = osc_extent_search(obj, cur->oe_start); - if (ext == NULL) + if (!ext) ext = first_extent(obj); - while (ext != NULL) { + for (; ext; ext = next_extent(ext)) { pgoff_t ext_chk_start = ext->oe_start >> ppc_bits; - pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; + pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; LASSERT(sanity_check_nolock(ext) == 0); if (chunk > ext_chk_end + 1 || chunk < ext_chk_start) @@ -712,15 +722,12 @@ restart: EASSERTF(!overlapped(ext, cur), ext, EXTSTR"\n", EXTPARA(cur)); - ext = next_extent(ext); continue; } /* discontiguous chunks? */ - if (chunk + 1 < ext_chk_start) { - ext = next_extent(ext); + if (chunk + 1 < ext_chk_start) continue; - } /* ok, from now on, ext and cur have these attrs: * 1. covered by the same lock @@ -745,30 +752,24 @@ restart: } /* non-overlapped extent */ - if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) { + if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) /* we can't do anything for a non OES_CACHE extent, or * if there is someone waiting for this extent to be * flushed, try next one. */ - ext = next_extent(ext); continue; - } /* check if they belong to the same rpc slot before trying to * merge. the extents are not overlapped and contiguous at * chunk level to get here. */ - if (ext->oe_max_end != max_end) { + if (ext->oe_max_end != max_end) /* if they don't belong to the same RPC slot or * max_pages_per_rpc has ever changed, do not merge. */ - ext = next_extent(ext); continue; - } /* check whether maximum extent size will be hit */ if ((ext_chk_end - ext_chk_start + 1 + 1) << ppc_bits > - cli->cl_max_extent_pages) { - ext = next_extent(ext); + cli->cl_max_extent_pages) continue; - } /* it's required that an extent must be contiguous at chunk * level so that we know the whole extent is covered by grant @@ -805,8 +806,6 @@ restart: } if (found != NULL) break; - - ext = next_extent(ext); } osc_extent_tree_dump(D_CACHE, obj); @@ -912,17 +911,6 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, RETURN(0); } -static int extent_wait_cb(struct osc_extent *ext, enum osc_extent_state state) -{ - int ret; - - osc_object_lock(ext->oe_obj); - ret = ext->oe_state == state; - osc_object_unlock(ext->oe_obj); - - return ret; -} - /** * Wait for the extent's state to become @state. */ @@ -930,8 +918,6 @@ static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, enum osc_extent_state state) { struct osc_object *obj = ext->oe_obj; - struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL, - LWI_ON_SIGNAL_NOOP, NULL); int rc = 0; ENTRY; @@ -953,18 +939,21 @@ static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, osc_extent_release(env, ext); /* wait for the extent until its state becomes @state */ - rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi); - if (rc == -ETIMEDOUT) { + rc = wait_event_idle_timeout(ext->oe_waitq, + smp_load_acquire(&ext->oe_state) == state, + cfs_time_seconds(600)); + if (rc == 0) { OSC_EXTENT_DUMP(D_ERROR, ext, "%s: wait ext to %u timedout, recovery in progress?\n", cli_name(osc_cli(obj)), state); - lwi = LWI_INTR(NULL, NULL); - rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), - &lwi); + wait_event_idle(ext->oe_waitq, + smp_load_acquire(&ext->oe_state) == state); } - if (rc == 0 && ext->oe_rc < 0) + if (ext->oe_rc < 0) rc = ext->oe_rc; + else + rc = 0; RETURN(rc); } @@ -1326,7 +1315,7 @@ static int osc_refresh_count(const struct lu_env *env, return 0; else if (cl_offset(obj, index + 1) > kms) /* catch sub-page write at end of file */ - return kms % PAGE_SIZE; + return kms & ~PAGE_MASK; else return PAGE_SIZE; } @@ -1426,11 +1415,6 @@ static void osc_release_write_grant(struct client_obd *cli, pga->flag &= ~OBD_BRW_FROM_GRANT; atomic_long_dec(&obd_dirty_pages); cli->cl_dirty_pages--; - if (pga->flag & OBD_BRW_NOCACHE) { - pga->flag &= ~OBD_BRW_NOCACHE; - atomic_long_dec(&obd_dirty_transit_pages); - cli->cl_dirty_transit--; - } EXIT; } @@ -1470,13 +1454,20 @@ static void __osc_unreserve_grant(struct client_obd *cli, } } -static void osc_unreserve_grant(struct client_obd *cli, - unsigned int reserved, unsigned int unused) +static void osc_unreserve_grant_nolock(struct client_obd *cli, + unsigned int reserved, + unsigned int unused) { - spin_lock(&cli->cl_loi_list_lock); __osc_unreserve_grant(cli, reserved, unused); if (unused > 0) osc_wake_cache_waiters(cli); +} + +static void osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + spin_lock(&cli->cl_loi_list_lock); + osc_unreserve_grant_nolock(cli, reserved, unused); spin_unlock(&cli->cl_loi_list_lock); } @@ -1536,7 +1527,7 @@ static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) */ static int osc_enter_cache_try(struct client_obd *cli, struct osc_async_page *oap, - int bytes, int transient) + int bytes) { int rc; @@ -1550,11 +1541,6 @@ static int osc_enter_cache_try(struct client_obd *cli, if (atomic_long_add_return(1, &obd_dirty_pages) <= obd_max_dirty_pages) { osc_consume_write_grant(cli, &oap->oap_brw_page); - if (transient) { - cli->cl_dirty_transit++; - atomic_long_inc(&obd_dirty_transit_pages); - oap->oap_brw_flags |= OBD_BRW_NOCACHE; - } rc = 1; goto out; } else @@ -1566,15 +1552,26 @@ out: return rc; } -static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) +/* Following two inlines exist to pass code fragments + * to wait_event_idle_exclusive_timeout_cmd(). Passing + * code fragments as macro args can look confusing, so + * we provide inlines to encapsulate them. + */ +static inline void cli_unlock_and_unplug(const struct lu_env *env, + struct client_obd *cli, + struct osc_async_page *oap) { - int rc; - spin_lock(&cli->cl_loi_list_lock); - rc = list_empty(&ocw->ocw_entry); spin_unlock(&cli->cl_loi_list_lock); - return rc; + osc_io_unplug_async(env, cli, NULL); + CDEBUG(D_CACHE, + "%s: sleeping for cache space for %p\n", + cli_name(cli), oap); } +static inline void cli_lock_after_unplug(struct client_obd *cli) +{ + spin_lock(&cli->cl_loi_list_lock); +} /** * The main entry to reserve dirty page accounting. Usually the grant reserved * in this function will be freed in bulk in osc_free_grant() unless it fails @@ -1585,15 +1582,23 @@ static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, struct osc_async_page *oap, int bytes) { - struct osc_object *osc = oap->oap_obj; - struct lov_oinfo *loi = osc->oo_oinfo; - struct osc_cache_waiter ocw; - struct l_wait_info lwi; - int rc = -EDQUOT; - ENTRY; + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + int rc = -EDQUOT; + int remain; + bool entered = false; + /* We cannot wait for a long time here since we are holding ldlm lock + * across the actual IO. If no requests complete fast (e.g. due to + * overloaded OST that takes a long time to process everything, we'd + * get evicted if we wait for a normal obd_timeout or some such. + * So we try to wait half the time it would take the client to be + * evicted by server which is half obd_timeout when AT is off + * or at least ldlm_enqueue_min with AT on. + * See LU-13131 */ + unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout / 2 : + ldlm_enqueue_min / 2); - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(AT_OFF ? obd_timeout : at_max), - NULL, LWI_ON_SIGNAL_NOOP, NULL); + ENTRY; OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); @@ -1608,77 +1613,40 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, GOTO(out, rc = -EDQUOT); } - /* Hopefully normal case - cache space and write credits available */ - if (list_empty(&cli->cl_cache_waiters) && - osc_enter_cache_try(cli, oap, bytes, 0)) { - OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); - GOTO(out, rc = 0); - } - - /* We can get here for two reasons: too many dirty pages in cache, or + /* + * We can wait here for two reasons: too many dirty pages in cache, or * run out of grants. In both cases we should write dirty pages out. * Adding a cache waiter will trigger urgent write-out no matter what * RPC size will be. - * The exiting condition is no avail grants and no dirty pages caching, - * that really means there is no space on the OST. */ - init_waitqueue_head(&ocw.ocw_waitq); - ocw.ocw_oap = oap; - ocw.ocw_grant = bytes; - while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) { - list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); - ocw.ocw_rc = 0; - spin_unlock(&cli->cl_loi_list_lock); - - osc_io_unplug_async(env, cli, NULL); - - CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", - cli_name(cli), &ocw, oap); - - rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); - - spin_lock(&cli->cl_loi_list_lock); - - if (rc < 0) { - /* l_wait_event is interrupted by signal or timed out */ - list_del_init(&ocw.ocw_entry); - break; - } - LASSERT(list_empty(&ocw.ocw_entry)); - rc = ocw.ocw_rc; - - if (rc != -EDQUOT) - break; - if (osc_enter_cache_try(cli, oap, bytes, 0)) { - rc = 0; - break; - } - } - - switch (rc) { - case 0: - OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n"); - break; - case -ETIMEDOUT: + * The exiting condition (other than success) is no avail grants + * and no dirty pages caching, that really means there is no space + * on the OST. + */ + remain = wait_event_idle_exclusive_timeout_cmd( + cli->cl_cache_waiters, + (entered = osc_enter_cache_try(cli, oap, bytes)) || + (cli->cl_dirty_pages == 0 && cli->cl_w_in_flight == 0), + timeout, + cli_unlock_and_unplug(env, cli, oap), + cli_lock_after_unplug(cli)); + + if (entered) { + if (remain == timeout) + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); + else + OSC_DUMP_GRANT(D_CACHE, cli, + "finally got grant space\n"); + wake_up(&cli->cl_cache_waiters); + rc = 0; + } else if (remain == 0) { OSC_DUMP_GRANT(D_CACHE, cli, "timeout, fall back to sync i/o\n"); osc_extent_tree_dump(D_CACHE, osc); /* fall back to synchronous I/O */ - rc = -EDQUOT; - break; - case -EINTR: - /* Ensures restartability - LU-3581 */ - OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n"); - rc = -ERESTARTSYS; - break; - case -EDQUOT: + } else { OSC_DUMP_GRANT(D_CACHE, cli, "no grant space, fall back to sync i/o\n"); - break; - default: - CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived " - "due to %d, fall back to sync i/o\n", - cli_name(cli), &ocw, rc); - break; + wake_up_all(&cli->cl_cache_waiters); } EXIT; out: @@ -1686,36 +1654,6 @@ out: RETURN(rc); } -/* caller must hold loi_list_lock */ -void osc_wake_cache_waiters(struct client_obd *cli) -{ - struct list_head *l, *tmp; - struct osc_cache_waiter *ocw; - - ENTRY; - list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); - - ocw->ocw_rc = -EDQUOT; - - if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) - ocw->ocw_rc = 0; - - if (ocw->ocw_rc == 0 || - !(cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0)) { - list_del_init(&ocw->ocw_entry); - CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant " - "%ld, %d\n", ocw, ocw->ocw_oap, - cli->cl_avail_grant, ocw->ocw_rc); - - wake_up(&ocw->ocw_waitq); - } - } - - EXIT; -} -EXPORT_SYMBOL(osc_wake_cache_waiters); - static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) { int hprpc = !!list_empty(&osc->oo_hp_exts); @@ -1755,8 +1693,9 @@ static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, } /* trigger a write rpc stream as long as there are dirtiers * waiting for space. as they're waiting, they're not going to - * create more pages to coalesce with what's waiting.. */ - if (!list_empty(&cli->cl_cache_waiters)) { + * create more pages to coalesce with what's waiting.. + */ + if (waitqueue_active(&cli->cl_cache_waiters)) { CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); RETURN(1); } @@ -1885,7 +1824,6 @@ static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, spin_lock(&oap->oap_lock); oap->oap_async_flags = 0; spin_unlock(&oap->oap_lock); - oap->oap_interrupted = 0; if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { spin_lock(&cli->cl_loi_list_lock); @@ -1918,6 +1856,31 @@ static inline unsigned osc_extent_chunks(const struct osc_extent *ext) return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1; } +static inline bool +can_merge(const struct osc_extent *ext, const struct osc_extent *in_rpc) +{ + if (ext->oe_no_merge || in_rpc->oe_no_merge) + return false; + + if (ext->oe_srvlock != in_rpc->oe_srvlock) + return false; + + if (ext->oe_ndelay != in_rpc->oe_ndelay) + return false; + + if (!ext->oe_grants != !in_rpc->oe_grants) + return false; + + if (ext->oe_dio != in_rpc->oe_dio) + return false; + + /* It's possible to have overlap on DIO */ + if (in_rpc->oe_dio && overlapped(ext, in_rpc)) + return false; + + return true; +} + /** * Try to add extent to one RPC. We need to think about the following things: * - # of pages must not be over max_pages_per_rpc @@ -1929,9 +1892,6 @@ static int try_to_add_extent_for_io(struct client_obd *cli, { struct osc_extent *tmp; unsigned int chunk_count; - struct osc_async_page *oap = list_first_entry(&ext->oe_pages, - struct osc_async_page, - oap_pending_item); ENTRY; EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), @@ -1960,30 +1920,10 @@ static int try_to_add_extent_for_io(struct client_obd *cli, RETURN(0); list_for_each_entry(tmp, data->erd_rpc_list, oe_link) { - struct osc_async_page *oap2; - oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page, - oap_pending_item); EASSERT(tmp->oe_owner == current, tmp); -#if 0 - if (overlapped(tmp, ext)) { - OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext); - EASSERT(0, ext); - } -#endif - if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) { - CDEBUG(D_CACHE, "Do not permit different types of IO " - "in one RPC\n"); - RETURN(0); - } - if (tmp->oe_srvlock != ext->oe_srvlock || - !tmp->oe_grants != !ext->oe_grants || - tmp->oe_ndelay != ext->oe_ndelay || - tmp->oe_no_merge || ext->oe_no_merge) + if (!can_merge(ext, tmp)) RETURN(0); - - /* remove break for strict check */ - break; } data->erd_max_extents--; @@ -2020,7 +1960,7 @@ static unsigned int get_write_extents(struct osc_object *obj, .erd_max_extents = 256, }; - LASSERT(osc_object_is_locked(obj)); + assert_osc_object_is_locked(obj); while (!list_empty(&obj->oo_hp_exts)) { ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, oe_link); @@ -2055,19 +1995,16 @@ static unsigned int get_write_extents(struct osc_object *obj, if (data.erd_page_count == data.erd_max_pages) return data.erd_page_count; - ext = first_extent(obj); - while (ext != NULL) { + for (ext = first_extent(obj); + ext; + ext = next_extent(ext)) { if ((ext->oe_state != OES_CACHE) || /* this extent may be already in current rpclist */ - (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) { - ext = next_extent(ext); + (!list_empty(&ext->oe_link) && ext->oe_owner)) continue; - } if (!try_to_add_extent_for_io(cli, ext, &data)) return data.erd_page_count; - - ext = next_extent(ext); } return data.erd_page_count; } @@ -2077,7 +2014,7 @@ osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, struct osc_object *osc) __must_hold(osc) { - struct list_head rpclist = LIST_HEAD_INIT(rpclist); + LIST_HEAD(rpclist); struct osc_extent *ext; struct osc_extent *tmp; struct osc_extent *first = NULL; @@ -2086,7 +2023,7 @@ __must_hold(osc) int rc = 0; ENTRY; - LASSERT(osc_object_is_locked(osc)); + assert_osc_object_is_locked(osc); page_count = get_write_extents(osc, &rpclist); LASSERT(equi(page_count == 0, list_empty(&rpclist))); @@ -2153,7 +2090,7 @@ __must_hold(osc) { struct osc_extent *ext; struct osc_extent *next; - struct list_head rpclist = LIST_HEAD_INIT(rpclist); + LIST_HEAD(rpclist); struct extent_rpc_data data = { .erd_rpc_list = &rpclist, .erd_page_count = 0, @@ -2164,7 +2101,7 @@ __must_hold(osc) int rc = 0; ENTRY; - LASSERT(osc_object_is_locked(osc)); + assert_osc_object_is_locked(osc); list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) { EASSERT(ext->oe_state == OES_LOCK_DONE, ext); if (!try_to_add_extent_for_io(cli, ext, &data)) @@ -2210,8 +2147,9 @@ static struct osc_object *osc_next_obj(struct client_obd *cli) /* then if we have cache waiters, return all objects with queued * writes. This is especially important when many small files * have filled up the cache and not been fired into rpcs because - * they don't pass the nr_pending/object threshhold */ - if (!list_empty(&cli->cl_cache_waiters) && + * they don't pass the nr_pending/object threshhold + */ + if (waitqueue_active(&cli->cl_cache_waiters) && !list_empty(&cli->cl_loi_write_list)) RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); @@ -2242,7 +2180,12 @@ __must_hold(&cli->cl_loi_list_lock) OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); - if (osc_max_rpc_in_flight(cli, osc)) { + /* even if we have reached our max in flight RPCs, we still + * allow all high-priority RPCs through to prevent their + * starvation and leading to server evicting us for not + * writing out pages in a timely manner LU-13131 */ + if (osc_max_rpc_in_flight(cli, osc) && + list_empty(&osc->oo_hp_exts)) { __osc_list_maint(cli, osc); break; } @@ -2347,13 +2290,14 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, EXPORT_SYMBOL(osc_prep_async_page); int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops) + struct osc_page *ops, cl_commit_cbt cb) { struct osc_io *oio = osc_env_io(env); struct osc_extent *ext = NULL; struct osc_async_page *oap = &ops->ops_oap; struct client_obd *cli = oap->oap_cli; struct osc_object *osc = oap->oap_obj; + struct pagevec *pvec = &osc_env_info(env)->oti_pagevec; pgoff_t index; unsigned int tmp; unsigned int grants = 0; @@ -2375,7 +2319,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* Set the OBD_BRW_SRVLOCK before the page is queued. */ brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; - if (oio->oi_cap_sys_resource) { + if (oio->oi_cap_sys_resource || io->ci_noquota) { brw_flags |= OBD_BRW_NOQUOTA; cmd |= OBD_BRW_NOQUOTA; } @@ -2396,7 +2340,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, qid[USRQUOTA] = attr->cat_uid; qid[GRPQUOTA] = attr->cat_gid; qid[PRJQUOTA] = attr->cat_projid; - if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA) + if (rc == 0 && osc_quota_chkdq(cli, qid) == -EDQUOT) rc = -EDQUOT; if (rc) RETURN(rc); @@ -2431,8 +2375,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* it doesn't need any grant to dirty this page */ spin_lock(&cli->cl_loi_list_lock); - rc = osc_enter_cache_try(cli, oap, grants, 0); - spin_unlock(&cli->cl_loi_list_lock); + rc = osc_enter_cache_try(cli, oap, grants); if (rc == 0) { /* try failed */ grants = 0; need_release = 1; @@ -2446,10 +2389,11 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, } else { OSC_EXTENT_DUMP(D_CACHE, ext, "expanded for %lu.\n", index); - osc_unreserve_grant(cli, grants, tmp); + osc_unreserve_grant_nolock(cli, grants, tmp); grants = 0; } } + spin_unlock(&cli->cl_loi_list_lock); rc = 0; } else if (ext != NULL) { /* index is located outside of active extent */ @@ -2472,7 +2416,14 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, rc = 0; if (grants == 0) { - /* we haven't allocated grant for this page. */ + /* We haven't allocated grant for this page, and we + * must not hold a page lock while we do enter_cache, + * so we must mark dirty & unlock any pages in the + * write commit pagevec. */ + if (pagevec_count(pvec)) { + cb(env, io, pvec); + pagevec_reinit(pvec); + } rc = osc_enter_cache(env, cli, oap, tmp); if (rc == 0) grants = tmp; @@ -2611,7 +2562,7 @@ int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; spin_unlock(&oap->oap_lock); - if (memory_pressure_get()) + if (current->flags & PF_MEMALLOC) ext->oe_memalloc = 1; ext->oe_urgent = 1; @@ -2633,71 +2584,6 @@ out: return rc; } -/** - * this is called when a sync waiter receives an interruption. Its job is to - * get the caller woken as soon as possible. If its page hasn't been put in an - * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as - * desiring interruption which will forcefully complete the rpc once the rpc - * has timed out. - */ -int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) -{ - struct osc_async_page *oap = &ops->ops_oap; - struct osc_object *obj = oap->oap_obj; - struct client_obd *cli = osc_cli(obj); - struct osc_extent *ext; - struct osc_extent *found = NULL; - struct list_head *plist; - pgoff_t index = osc_index(ops); - int rc = -EBUSY; - int cmd; - ENTRY; - - LASSERT(!oap->oap_interrupted); - oap->oap_interrupted = 1; - - /* Find out the caching extent */ - osc_object_lock(obj); - if (oap->oap_cmd & OBD_BRW_WRITE) { - plist = &obj->oo_urgent_exts; - cmd = OBD_BRW_WRITE; - } else { - plist = &obj->oo_reading_exts; - cmd = OBD_BRW_READ; - } - list_for_each_entry(ext, plist, oe_link) { - if (ext->oe_start <= index && ext->oe_end >= index) { - LASSERT(ext->oe_state == OES_LOCK_DONE); - /* For OES_LOCK_DONE state extent, it has already held - * a refcount for RPC. */ - found = osc_extent_get(ext); - break; - } - } - if (found != NULL) { - list_del_init(&found->oe_link); - osc_update_pending(obj, cmd, -found->oe_nr_pages); - osc_object_unlock(obj); - - osc_extent_finish(env, found, 0, -EINTR); - osc_extent_put(env, found); - rc = 0; - } else { - osc_object_unlock(obj); - /* ok, it's been put in an rpc. only one oap gets a request - * reference */ - if (oap->oap_request != NULL) { - ptlrpc_mark_interrupted(oap->oap_request); - ptlrpcd_wake(oap->oap_request); - ptlrpc_req_finished(oap->oap_request); - oap->oap_request = NULL; - } - } - - osc_list_maint(cli, obj); - RETURN(rc); -} - int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, struct osc_object *obj, struct list_head *list, int brw_flags) @@ -2747,6 +2633,7 @@ int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, ext->oe_obj = obj; ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY); + ext->oe_dio = !!(brw_flags & OBD_BRW_NOCACHE); ext->oe_nr_pages = page_count; ext->oe_mppr = mppr; list_splice_init(list, &ext->oe_pages); @@ -2778,7 +2665,7 @@ int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, struct osc_extent *ext; struct osc_extent *waiting = NULL; pgoff_t index; - struct list_head list = LIST_HEAD_INIT(list); + LIST_HEAD(list); int result = 0; bool partial; ENTRY; @@ -2996,7 +2883,7 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, pgoff_t start, pgoff_t end, int hp, int discard) { struct osc_extent *ext; - struct list_head discard_list = LIST_HEAD_INIT(discard_list); + LIST_HEAD(discard_list); bool unplug = false; int result = 0; ENTRY; @@ -3021,7 +2908,7 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, EASSERT(!ext->oe_hp, ext); ext->oe_hp = 1; list = &obj->oo_hp_exts; - } else if (!ext->oe_urgent) { + } else if (!ext->oe_urgent && !ext->oe_hp) { ext->oe_urgent = 1; list = &obj->oo_urgent_exts; } @@ -3029,10 +2916,25 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, list_move_tail(&ext->oe_link, list); unplug = true; } else { + struct client_obd *cli = osc_cli(obj); + int pcc_bits = cli->cl_chunkbits - PAGE_SHIFT; + pgoff_t align_by = (1 << pcc_bits); + pgoff_t a_start = round_down(start, align_by); + pgoff_t a_end = round_up(end, align_by); + + /* overflow case */ + if (end && !a_end) + a_end = CL_PAGE_EOF; /* the only discarder is lock cancelling, so - * [start, end] must contain this extent */ - EASSERT(ext->oe_start >= start && - ext->oe_max_end <= end, ext); + * [start, end], aligned by chunk size, must + * contain this extent */ + LASSERTF(ext->oe_start >= a_start && + ext->oe_end <= a_end, + "ext [%lu, %lu] reg [%lu, %lu] " + "orig [%lu %lu] align %lu bits " + "%d\n", ext->oe_start, ext->oe_end, + a_start, a_end, start, end, + align_by, pcc_bits); osc_extent_state_set(ext, OES_LOCKING); ext->oe_owner = current; list_move_tail(&ext->oe_link, @@ -3103,18 +3005,14 @@ EXPORT_SYMBOL(osc_cache_writeback_range); /** * Returns a list of pages by a given [start, end] of \a obj. * - * \param resched If not NULL, then we give up before hogging CPU for too - * long and set *resched = 1, in that case caller should implement a retry - * logic. - * * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely * crucial in the face of [offset, EOF] locks. * * Return at least one page in @queue unless there is no covered page. */ -int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, - struct osc_object *osc, pgoff_t start, pgoff_t end, - osc_page_gang_cbt cb, void *cbdata) +bool osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + osc_page_gang_cbt cb, void *cbdata) { struct osc_page *ops; struct pagevec *pagevec; @@ -3123,7 +3021,7 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, unsigned int nr; unsigned int i; unsigned int j; - int res = CLP_GANG_OKAY; + bool res = true; bool tree_lock = true; ENTRY; @@ -3172,7 +3070,7 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, for (i = 0; i < j; ++i) { ops = pvec[i]; - if (res == CLP_GANG_OKAY) + if (res) res = (*cb)(env, io, ops, cbdata); page = ops->ops_cl.cpl_page; @@ -3184,10 +3082,10 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, if (nr < OTI_PVEC_SIZE || end_of_region) break; - if (res == CLP_GANG_OKAY && need_resched()) - res = CLP_GANG_RESCHED; - if (res != CLP_GANG_OKAY) + if (!res) break; + if (need_resched()) + cond_resched(); spin_lock(&osc->oo_tree_lock); tree_lock = true; @@ -3201,7 +3099,7 @@ EXPORT_SYMBOL(osc_page_gang_lookup); /** * Check if page @page is covered by an extra lock or discard it. */ -static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, +static bool check_and_discard_cb(const struct lu_env *env, struct cl_io *io, struct osc_page *ops, void *cbdata) { struct osc_thread_info *info = osc_env_info(env); @@ -3236,10 +3134,10 @@ static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, } info->oti_next_index = index + 1; - return CLP_GANG_OKAY; + return true; } -int osc_discard_cb(const struct lu_env *env, struct cl_io *io, +bool osc_discard_cb(const struct lu_env *env, struct cl_io *io, struct osc_page *ops, void *cbdata) { struct osc_thread_info *info = osc_env_info(env); @@ -3260,7 +3158,7 @@ int osc_discard_cb(const struct lu_env *env, struct cl_io *io, LASSERT(page->cp_state == CPS_FREEING); } - return CLP_GANG_OKAY; + return true; } EXPORT_SYMBOL(osc_discard_cb); @@ -3278,7 +3176,6 @@ int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, struct osc_thread_info *info = osc_env_info(env); struct cl_io *io = osc_env_thread_io(env); osc_page_gang_cbt cb; - int res; int result; ENTRY; @@ -3291,15 +3188,9 @@ int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, cb = discard ? osc_discard_cb : check_and_discard_cb; info->oti_fn_index = info->oti_next_index = start; - do { - res = osc_page_gang_lookup(env, io, osc, - info->oti_next_index, end, cb, osc); - if (info->oti_next_index > end) - break; - if (res == CLP_GANG_RESCHED) - cond_resched(); - } while (res != CLP_GANG_OKAY); + osc_page_gang_lookup(env, io, osc, + info->oti_next_index, end, cb, osc); out: cl_io_fini(env, io); RETURN(result);