From 0a6c6fcd46a4e2eb289eff72402e34d329a63d91 Mon Sep 17 00:00:00 2001
From: Jinshan Xiong
Date: Mon, 23 Jun 2014 14:20:52 -0700
Subject: [PATCH] LU-4786 osc: to not pick busy pages for ELC

This is a combination of commit 154fb1f7 from LU-3321 and commit
bfae5a4e from LU-4300. This patch is only required in b2_5 as the
code is already in 2.6.

Use weigh_ast to decide if a lock covers any pages. During recovery,
weigh_ast is used to decide whether a DLM read lock covers any locked
pages; if it does not, the lock is canceled instead of being recovered.
The problem with the original implementation was that it attached each
osc_page to an osc_lock, and it also changed the lock state to add
every page for readahead.

Bugzilla-bug-Id: b=16774

Change the policy of ELC to pick locks that have no dirty pages, no
pages in writeback, and no locked pages.

Signed-off-by: Jinshan Xiong
Change-Id: Ie857a4dd4e235f0c3273f53cbfab9592c2793ac9
Reviewed-on: http://review.whamcloud.com/10795
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Andreas Dilger
Reviewed-by: Bobi Jam
Reviewed-by: Oleg Drokin
---
 lustre/include/lustre_dlm.h  |  15 +++---
 lustre/ldlm/ldlm_request.c   |  53 +++++++++++--------
 lustre/mdc/mdc_request.c     |  20 +++----
 lustre/osc/osc_cl_internal.h |  20 -------
 lustre/osc/osc_internal.h    |   3 +-
 lustre/osc/osc_lock.c        | 121 ++++++++++++++++++++++++++++++------------
 lustre/osc/osc_page.c        |  85 ++++--------------------
 lustre/osc/osc_request.c     |  25 ++++-----
 8 files changed, 158 insertions(+), 184 deletions(-)

diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h
index 30b70b5..7ec97dd 100644
--- a/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@ -279,7 +279,7 @@ typedef int (*ldlm_res_policy)(struct ldlm_namespace *,
 			       struct ldlm_lock **, void *req_cookie,
 			       ldlm_mode_t mode, __u64 flags, void *data);
 
-typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock);
+typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock);
 
 /**
  * LVB operations.
@@ -497,8 +497,11 @@ struct ldlm_namespace {
 	/** Limit of parallel AST RPC count. */
 	unsigned		ns_max_parallel_ast;
 
-	/** Callback to cancel locks before replaying it during recovery. */
-	ldlm_cancel_for_recovery ns_cancel_for_recovery;
+	/**
+	 * Callback to check if a lock can be canceled by ELC or
+	 * during recovery.
+	 */
+	ldlm_cancel_cbt		ns_cancel;
 
 	/** LDLM lock stats */
 	struct lprocfs_stats	*ns_stats;
@@ -555,10 +558,10 @@ static inline int ns_connect_lru_resize(struct ldlm_namespace *ns)
 }
 
 static inline void ns_register_cancel(struct ldlm_namespace *ns,
-				      ldlm_cancel_for_recovery arg)
+				      ldlm_cancel_cbt arg)
 {
-        LASSERT(ns != NULL);
-        ns->ns_cancel_for_recovery = arg;
+	LASSERT(ns != NULL);
+	ns->ns_cancel = arg;
 }
 
 struct ldlm_lock;
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c
index 5dc364b..d59e29d 100644
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -1449,23 +1449,22 @@ static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns,
 						    int unused, int added,
 						    int count)
 {
 	ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK;
-	ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery;
-
-	lock_res_and_lock(lock);
 
 	/* don't check added & count since we want to process all locks
 	 * from unused list */
 	switch (lock->l_resource->lr_type) {
 	case LDLM_EXTENT:
 	case LDLM_IBITS:
-		if (cb && cb(lock))
+		if (ns->ns_cancel != NULL && ns->ns_cancel(lock) != 0)
 			break;
 	default:
 		result = LDLM_POLICY_SKIP_LOCK;
+		lock_res_and_lock(lock);
 		lock->l_flags |= LDLM_FL_SKIPPED;
+		unlock_res_and_lock(lock);
 		break;
 	}
-	unlock_res_and_lock(lock);
 
 	RETURN(result);
 }
@@ -1483,20 +1482,20 @@ static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
 						 int unused, int added,
 						 int count)
 {
-        cfs_time_t cur = cfs_time_current();
-        struct ldlm_pool *pl = &ns->ns_pool;
-        __u64 slv, lvf, lv;
-        cfs_time_t la;
+	cfs_time_t cur = cfs_time_current();
+	struct ldlm_pool *pl = &ns->ns_pool;
+	__u64 slv, lvf, lv;
+	cfs_time_t la;
 
 	/* Stop LRU processing when we reach past @count or have checked all
 	 * locks in LRU. */
-        if (count && added >= count)
-                return LDLM_POLICY_KEEP_LOCK;
+	if (count && added >= count)
+		return LDLM_POLICY_KEEP_LOCK;
 
-        slv = ldlm_pool_get_slv(pl);
-        lvf = ldlm_pool_get_lvf(pl);
-        la = cfs_duration_sec(cfs_time_sub(cur,
-                              lock->l_last_used));
+	slv = ldlm_pool_get_slv(pl);
+	lvf = ldlm_pool_get_lvf(pl);
+	la = cfs_duration_sec(cfs_time_sub(cur,
+					   lock->l_last_used));
 	lv = lvf * la * unused;
 
 	/* Inform pool about current CLV to see it via proc. */
@@ -1504,8 +1503,13 @@ static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
 
 	/* Stop when SLV is not yet come from server or lv is smaller than
 	 * it is. */
-	return (slv == 0 || lv < slv) ?
-		LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+	if (slv == 0 || lv < slv)
+		return LDLM_POLICY_KEEP_LOCK;
+
+	if (ns->ns_cancel != NULL && ns->ns_cancel(lock) == 0)
+		return LDLM_POLICY_KEEP_LOCK;
+
+	return LDLM_POLICY_CANCEL_LOCK;
 }
 
 /**
@@ -1542,12 +1546,17 @@ static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns,
 						 int unused, int added,
 						 int count)
 {
-	/* Stop LRU processing if young lock is found and we reach past count */
-	return ((added >= count) &&
-		cfs_time_before(cfs_time_current(),
-				cfs_time_add(lock->l_last_used,
-					     ns->ns_max_age))) ?
-		LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+	if (added >= count)
+		return LDLM_POLICY_KEEP_LOCK;
+
+	if (cfs_time_before(cfs_time_current(),
+			    cfs_time_add(lock->l_last_used, ns->ns_max_age)))
+		return LDLM_POLICY_KEEP_LOCK;
+
+	if (ns->ns_cancel != NULL && ns->ns_cancel(lock) == 0)
+		return LDLM_POLICY_KEEP_LOCK;
+
+	return LDLM_POLICY_CANCEL_LOCK;
 }
 
 /**
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
index 5d331f0..0aca920 100644
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -2467,18 +2467,18 @@ struct obd_uuid *mdc_get_uuid(struct obd_export *exp) {
  * recovery, non zero value will be return if the lock can be canceled,
  * or zero returned for not
  */
-static int mdc_cancel_for_recovery(struct ldlm_lock *lock)
+static int mdc_cancel_weight(struct ldlm_lock *lock)
 {
-        if (lock->l_resource->lr_type != LDLM_IBITS)
-                RETURN(0);
+	if (lock->l_resource->lr_type != LDLM_IBITS)
+		RETURN(0);
 
-        /* FIXME: if we ever get into a situation where there are too many
-         * opened files with open locks on a single node, then we really
-         * should replay these open locks to reget it */
-        if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN)
-                RETURN(0);
+	/* FIXME: if we ever get into a situation where there are too many
+	 * open files with open locks on a single node, then we really
+	 * should replay these open locks to reacquire them */
+	if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN)
+		RETURN(0);
 
-        RETURN(1);
+	RETURN(1);
 }
 
 static int mdc_resource_inode_free(struct ldlm_resource *res)
@@ -2523,7 +2523,7 @@ static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
 	sptlrpc_lprocfs_cliobd_attach(obd);
 	ptlrpc_lprocfs_register_obd(obd);
 
-	ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery);
+	ns_register_cancel(obd->obd_namespace, mdc_cancel_weight);
 
 	obd->obd_namespace->ns_lvbo = &inode_lvbo;
 
diff --git a/lustre/osc/osc_cl_internal.h b/lustre/osc/osc_cl_internal.h
index a8c2ec5..01153f1 100644
--- a/lustre/osc/osc_cl_internal.h
+++ b/lustre/osc/osc_cl_internal.h
@@ -262,16 +262,6 @@ struct osc_lock {
 	enum osc_lock_state	 ols_state;
 
 	/**
-	 * How many pages are using this lock for io, currently only used by
-	 * read-ahead. If non-zero, the underlying dlm lock won't be cancelled
-	 * during recovery to avoid deadlock. see bz16774.
-	 *
-	 * \see osc_page::ops_lock
-	 * \see osc_page_addref_lock(), osc_page_putref_lock()
-	 */
-	cfs_atomic_t		 ols_pageref;
-
-	/**
 	 * true, if ldlm_lock_addref() was called against
 	 * osc_lock::ols_lock. This is used for sanity checking.
 	 *
@@ -391,16 +381,6 @@ struct osc_page {
 	 * Submit time - the time when the page is starting RPC. For debugging.
 	 */
 	cfs_time_t		 ops_submit_time;
-
-	/**
-	 * A lock of which we hold a reference covers this page. Only used by
-	 * read-ahead: for a readahead page, we hold it's covering lock to
-	 * prevent it from being canceled during recovery.
-	 *
-	 * \see osc_lock::ols_pageref
-	 * \see osc_page_addref_lock(), osc_page_putref_lock().
-	 */
-	struct cl_lock		*ops_lock;
 };
 
 extern struct kmem_cache *osc_lock_kmem;
diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h
index 3fd6522..f93c723 100644
--- a/lustre/osc/osc_internal.h
+++ b/lustre/osc/osc_internal.h
@@ -132,6 +132,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
 int osc_lru_shrink(struct client_obd *cli, int target);
 
 extern spinlock_t osc_ast_guard;
+unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock);
 
 int osc_cleanup(struct obd_device *obd);
 int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
@@ -186,8 +187,6 @@ static inline struct osc_device *obd2osc_dev(const struct obd_device *d)
 	return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev);
 }
 
-int osc_dlm_lock_pageref(struct ldlm_lock *dlm);
-
 extern struct kmem_cache *osc_quota_kmem;
 struct osc_quota_info {
 	/** linkage for quota hash table */
diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c
index 8c50766..b915b80 100644
--- a/lustre/osc/osc_lock.c
+++ b/lustre/osc/osc_lock.c
@@ -54,8 +54,6 @@
  * @{
  */
 
-#define _PAGEREF_MAGIC (-10000000)
-
 /*****************************************************************************
  *
  * Type conversions.
@@ -252,9 +250,6 @@ static void osc_lock_fini(const struct lu_env *env,
 	 */
 	osc_lock_unhold(ols);
 	LASSERT(ols->ols_lock == NULL);
-	LASSERT(cfs_atomic_read(&ols->ols_pageref) == 0 ||
-		cfs_atomic_read(&ols->ols_pageref) == _PAGEREF_MAGIC);
-
 	OBD_SLAB_FREE_PTR(ols, osc_lock_kmem);
 }
 
@@ -910,14 +905,92 @@ static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
 	return result;
 }
 
-static unsigned long osc_lock_weigh(const struct lu_env *env,
-				    const struct cl_lock_slice *slice)
+static int weigh_cb(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page *page, void *cbdata)
 {
-	/*
-	 * don't need to grab coh_page_guard since we don't care the exact #
-	 * of pages..
-	 */
-	return cl_object_header(slice->cls_obj)->coh_pages;
+#if defined(__KERNEL__)
+	struct page *vmpage = cl_page_vmpage(env, page);
+
+	if (PageLocked(vmpage) || PageDirty(vmpage) || PageWriteback(vmpage)) {
+		(*(unsigned long *)cbdata)++;
+		return CLP_GANG_ABORT;
+	}
+#endif
+
+	return CLP_GANG_OKAY;
+}
+
+static unsigned long osc_lock_weight(const struct lu_env *env,
+				     const struct osc_lock *ols)
+{
+	struct cl_io *io = &osc_env_info(env)->oti_io;
+	struct cl_lock_descr *descr = &ols->ols_cl.cls_lock->cll_descr;
+	struct cl_object *obj = ols->ols_cl.cls_obj;
+	unsigned long npages = 0;
+	int result;
+	ENTRY;
+
+	io->ci_obj = cl_object_top(obj);
+	io->ci_ignore_layout = 1;
+	result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+	if (result != 0)
+		RETURN(result);
+
+	do {
+		result = cl_page_gang_lookup(env, obj, io,
+					     descr->cld_start, descr->cld_end,
+					     weigh_cb, (void *)&npages);
+		if (result == CLP_GANG_ABORT)
+			break;
+		if (result == CLP_GANG_RESCHED)
+			cond_resched();
+	} while (result != CLP_GANG_OKAY);
+	cl_io_fini(env, io);
+
+	return npages;
+}
+
+/**
+ * Get the weight of a DLM lock for early cancellation.
+ */
+unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock)
+{
+	struct cl_env_nest nest;
+	struct lu_env *env;
+	struct osc_lock *lock;
+	unsigned long weight;
+	ENTRY;
+
+	might_sleep();
+	/*
+	 * osc_ldlm_weigh_ast has a complex context since it might be called
+	 * because of lock canceling, or from user's input. We have to make
+	 * a new environment for it. It is probably safe to use the upper
+	 * context here because cl_lock_put doesn't modify environment
+	 * variables. But just in case ..
+	 */
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		/* Mostly due to lack of memory; do not eliminate this lock */
+		RETURN(1);
+
+	LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT);
+	lock = osc_ast_data_get(dlmlock);
+	if (lock == NULL) {
+		/* cl_lock was destroyed because of memory pressure.
+		 * It is reasonable to assign this type of lock
+		 * a lower cost.
+		 */
+		GOTO(out, weight = 0);
+	}
+
+	weight = osc_lock_weight(env, lock);
+	osc_ast_data_put(env, lock);
+	EXIT;
+
+out:
+	cl_env_nested_put(&nest, env);
+	return weight;
 }
 
 static void osc_lock_build_einfo(const struct lu_env *env,
@@ -1555,7 +1628,6 @@ static const struct cl_lock_operations osc_lock_ops = {
 	.clo_delete    = osc_lock_delete,
 	.clo_state     = osc_lock_state,
 	.clo_cancel    = osc_lock_cancel,
-	.clo_weigh     = osc_lock_weigh,
 	.clo_print     = osc_lock_print,
 	.clo_fits_into = osc_lock_fits_into,
 };
@@ -1656,7 +1728,6 @@ int osc_lock_init(const struct lu_env *env,
 		__u32 enqflags = lock->cll_descr.cld_enq_flags;
 
 		osc_lock_build_einfo(env, lock, clk, &clk->ols_einfo);
-		cfs_atomic_set(&clk->ols_pageref, 0);
 		clk->ols_state = OLS_NEW;
 
 		clk->ols_flags = osc_enq2ldlm_flags(enqflags);
@@ -1683,26 +1754,4 @@ int osc_lock_init(const struct lu_env *env,
 	return result;
 }
 
-int osc_dlm_lock_pageref(struct ldlm_lock *dlm)
-{
-	struct osc_lock *olock;
-	int rc = 0;
-
-	spin_lock(&osc_ast_guard);
-	olock = dlm->l_ast_data;
-	/*
-	 * there's a very rare race with osc_page_addref_lock(), but that
-	 * doesn't matter because in the worst case we don't cancel a lock
-	 * which we actually can, that's no harm.
-	 */
-	if (olock != NULL &&
-	    cfs_atomic_add_return(_PAGEREF_MAGIC,
-				  &olock->ols_pageref) != _PAGEREF_MAGIC) {
-		cfs_atomic_sub(_PAGEREF_MAGIC, &olock->ols_pageref);
-		rc = 1;
-	}
-	spin_unlock(&osc_ast_guard);
-	return rc;
-}
-
 /** @} osc */
diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c
index 37b62af..f158c2c 100644
--- a/lustre/osc/osc_page.c
+++ b/lustre/osc/osc_page.c
@@ -165,9 +165,6 @@ static int osc_page_protected(const struct lu_env *env,
 static void osc_page_fini(const struct lu_env *env,
 			  struct cl_page_slice *slice)
 {
-	struct osc_page *opg = cl2osc_page(slice);
-	CDEBUG(D_TRACE, "%p\n", opg);
-	LASSERT(opg->ops_lock == NULL);
 }
 
 static void osc_page_transfer_get(struct osc_page *opg, const char *label)
@@ -251,42 +248,6 @@ void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj,
 	policy->l_extent.end = cl_offset(obj, end + 1) - 1;
 }
 
-static int osc_page_addref_lock(const struct lu_env *env,
-				struct osc_page *opg,
-				struct cl_lock *lock)
-{
-	struct osc_lock *olock;
-	int rc;
-
-	LASSERT(opg->ops_lock == NULL);
-
-	olock = osc_lock_at(lock);
-	if (cfs_atomic_inc_return(&olock->ols_pageref) <= 0) {
-		cfs_atomic_dec(&olock->ols_pageref);
-		rc = -ENODATA;
-	} else {
-		cl_lock_get(lock);
-		opg->ops_lock = lock;
-		rc = 0;
-	}
-	return rc;
-}
-
-static void osc_page_putref_lock(const struct lu_env *env,
-				 struct osc_page *opg)
-{
-	struct cl_lock *lock = opg->ops_lock;
-	struct osc_lock *olock;
-
-	LASSERT(lock != NULL);
-	olock = osc_lock_at(lock);
-
-	cfs_atomic_dec(&olock->ols_pageref);
-	opg->ops_lock = NULL;
-
-	cl_lock_put(env, lock);
-}
-
 static int osc_page_is_under_lock(const struct lu_env *env,
 				  const struct cl_page_slice *slice,
 				  struct cl_io *unused)
@@ -298,38 +259,15 @@ static int osc_page_is_under_lock(const struct lu_env *env,
 	lock = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page,
 			       NULL, 1, 0);
 	if (lock != NULL) {
-		if (osc_page_addref_lock(env, cl2osc_page(slice), lock) == 0)
-			result = -EBUSY;
 		cl_lock_put(env, lock);
+		result = -EBUSY;
 	}
 	RETURN(result);
 }
 
-static void osc_page_disown(const struct lu_env *env,
-			    const struct cl_page_slice *slice,
-			    struct cl_io *io)
-{
-	struct osc_page *opg = cl2osc_page(slice);
-
-	if (unlikely(opg->ops_lock))
-		osc_page_putref_lock(env, opg);
-}
-
-static void osc_page_completion_read(const struct lu_env *env,
-				     const struct cl_page_slice *slice,
-				     int ioret)
-{
-	struct osc_page *opg = cl2osc_page(slice);
-	struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
-
-	if (likely(opg->ops_lock))
-		osc_page_putref_lock(env, opg);
-	osc_lru_add(osc_cli(obj), opg);
-}
-
-static void osc_page_completion_write(const struct lu_env *env,
-				      const struct cl_page_slice *slice,
-				      int ioret)
+static void osc_page_completion(const struct lu_env *env,
+				const struct cl_page_slice *slice,
+				int ioret)
 {
 	struct osc_page *opg = cl2osc_page(slice);
 	struct osc_object *obj = cl2osc(slice->cpl_obj);
@@ -491,15 +429,14 @@ static const struct cl_page_operations osc_page_ops = {
 	.cpo_print         = osc_page_print,
 	.cpo_delete        = osc_page_delete,
 	.cpo_is_under_lock = osc_page_is_under_lock,
-	.cpo_disown        = osc_page_disown,
-	.io = {
-		[CRT_READ] = {
-			.cpo_cache_add  = osc_page_fail,
-			.cpo_completion = osc_page_completion_read
-		},
-		[CRT_WRITE] = {
+	.io = {
+		[CRT_READ] = {
+			.cpo_cache_add  = osc_page_fail,
+			.cpo_completion = osc_page_completion
+		},
+		[CRT_WRITE] = {
 			.cpo_cache_add  = osc_page_cache_add,
-			.cpo_completion = osc_page_completion_write
+			.cpo_completion = osc_page_completion
 		}
 	},
 	.cpo_clip           = osc_page_clip,
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index cb4ba3a..30821395 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -3451,21 +3451,18 @@ static int osc_import_event(struct obd_device *obd,
  */
 static int osc_cancel_for_recovery(struct ldlm_lock *lock)
 {
-        check_res_locked(lock->l_resource);
-
-        /*
-         * Cancel all unused extent lock in granted mode LCK_PR or LCK_CR.
-         *
-         * XXX as a future improvement, we can also cancel unused write lock
-         * if it doesn't have dirty data and active mmaps.
-         */
-        if (lock->l_resource->lr_type == LDLM_EXTENT &&
-            (lock->l_granted_mode == LCK_PR ||
-             lock->l_granted_mode == LCK_CR) &&
-            (osc_dlm_lock_pageref(lock) == 0))
-                RETURN(1);
+	/*
+	 * Cancel all unused extent locks in granted mode LCK_PR or LCK_CR.
+	 *
+	 * XXX as a future improvement, we can also cancel unused write locks
+	 * if they don't have dirty data and active mmaps.
+	 */
+	if (lock->l_resource->lr_type == LDLM_EXTENT &&
+	    (lock->l_granted_mode == LCK_PR || lock->l_granted_mode == LCK_CR) &&
+	    osc_ldlm_weigh_ast(lock) == 0)
+		RETURN(1);
 
-        RETURN(0);
+	RETURN(0);
 }
 
 static int brw_queue_work(const struct lu_env *env, void *data)
-- 
1.8.3.1