From 64a9bd496e41011c9f67aee61340dd21e256e93a Mon Sep 17 00:00:00 2001
From: zhanghc
Date: Thu, 22 Oct 2009 14:44:42 +0000
Subject: [PATCH] b=16774 on the client side, cancel unused locks before
 replaying locks during recovery

i=johann@sun.com
i=oleg.drokin@sun.com
---
 lustre/include/lustre_dlm.h | 11 +++++
 lustre/ldlm/ldlm_internal.h |  6 ++-
 lustre/ldlm/ldlm_request.c  | 99 ++++++++++++++++++++++++++++++++++++++++++---
 lustre/ldlm/ldlm_resource.c |  5 +++
 lustre/mdc/mdc_request.c    | 19 +++++++++
 lustre/osc/osc_request.c    | 24 +++++++++++
 6 files changed, 156 insertions(+), 8 deletions(-)

diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h
index 1696c7b..11d2605 100644
--- a/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@ -358,6 +358,8 @@ typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **,
                                void *req_cookie, ldlm_mode_t mode, int flags,
                                void *data);
 
+typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock);
+
 struct ldlm_valblock_ops {
         int (*lvbo_init)(struct ldlm_resource *res);
         int (*lvbo_update)(struct ldlm_resource *res, struct ptlrpc_request *r,
@@ -420,6 +422,9 @@ struct ldlm_namespace {
         struct adaptive_timeout ns_at_estimate;/* estimated lock callback time*/
         /* backward link to obd, required for ldlm pool to store new SLV. */
         struct obd_device *ns_obd;
+
+        /* callback to cancel locks before replaying them during recovery */
+        ldlm_cancel_for_recovery ns_cancel_for_recovery;
 };
 
 static inline int ns_is_client(struct ldlm_namespace *ns)
@@ -448,6 +453,12 @@ static inline int ns_connect_lru_resize(struct ldlm_namespace *ns)
         return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE);
 }
 
+static inline void ns_register_cancel(struct ldlm_namespace *ns,
+                                      ldlm_cancel_for_recovery arg)
+{
+        LASSERT(ns != NULL);
+        ns->ns_cancel_for_recovery = arg;
+}
 /*
  *
  * Resource hash table
diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h
index 2d898d6..d62520a 100644
--- a/lustre/ldlm/ldlm_internal.h
+++ b/lustre/ldlm/ldlm_internal.h
@@ -72,7 +72,8 @@ enum {
         LDLM_CANCEL_AGED   = 1 << 0, /* Cancel aged locks (non lru resize). */
         LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */
         LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */
-        LDLM_CANCEL_LRUR   = 1 << 3  /* Cancel locks from lru resize. */
+        LDLM_CANCEL_LRUR   = 1 << 3, /* Cancel locks from lru resize. */
+        LDLM_CANCEL_NO_WAIT = 1 << 4 /* Cancel locks w/o blocking (neither
+                                      * sending nor waiting for any rpcs) */
 };
 
 int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
@@ -194,7 +195,8 @@ void ldlm_exit(void);
 
 enum ldlm_policy_res {
         LDLM_POLICY_CANCEL_LOCK,
-        LDLM_POLICY_KEEP_LOCK
+        LDLM_POLICY_KEEP_LOCK,
+        LDLM_POLICY_SKIP_LOCK
 };
 
 typedef enum ldlm_policy_res ldlm_policy_res_t;
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c
index b6e402b..bbd6ea9 100644
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -50,6 +50,9 @@ int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
 CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
                 "lock enqueue timeout minimum");
 
+/* on the client side, whether cached locks will be canceled before replay */
+unsigned int ldlm_cancel_unused_locks_before_replay = 1;
+
 static void interrupted_completion_wait(void *data)
 {
 }
@@ -1178,6 +1181,33 @@ static int ldlm_cancel_list(struct list_head *cancels, int count, int flags)
         RETURN(count);
 }
 
+/* Cancel as many locks as possible w/o sending any rpcs (e.g. to write back
+ * dirty data, to close a file, ...) or waiting for any rpcs in-flight (e.g.
+ * readahead requests, ...) */
+static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns,
+                                                    struct ldlm_lock *lock,
+                                                    int unused, int added,
+                                                    int count)
+{
+        ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK;
+        ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery;
+        check_res_locked(lock->l_resource);
+
+        /* don't check added & count since we want to process all locks
+         * from unused list */
+        switch (lock->l_resource->lr_type) {
+        case LDLM_EXTENT:
+        case LDLM_IBITS:
+                if (cb && cb(lock))
+                        break;
+        default:
+                result = LDLM_POLICY_SKIP_LOCK;
+                break;
+        }
+
+        RETURN(result);
+}
+
 /* Return 1 to stop lru processing and keep current lock cached. Return zero
  * otherwise. */
 static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
@@ -1260,6 +1290,9 @@ typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *,
 static ldlm_cancel_lru_policy_t
 ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
 {
+        if (flags & LDLM_CANCEL_NO_WAIT)
+                return ldlm_cancel_no_wait_policy;
+
         if (ns_connect_lru_resize(ns)) {
                 if (flags & LDLM_CANCEL_SHRINK)
                         /* We kill passed number of old locks. */
@@ -1301,17 +1334,23 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
  * memory pressre policy function;
  *
  * flags & LDLM_CANCEL_AGED - cancel locks according to "aged policy".
+ *
+ * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible
+ *                               (typically before replaying locks) w/o
+ *                               sending any rpcs or waiting for any
+ *                               outstanding rpc to complete
  */
 int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
                           int count, int max, int cancel_flags, int flags)
 {
         ldlm_cancel_lru_policy_t pf;
         struct ldlm_lock *lock, *next;
-        int added = 0, unused;
+        int added = 0, unused, remained;
         ENTRY;
 
         spin_lock(&ns->ns_unused_lock);
         unused = ns->ns_nr_unused;
+        remained = unused;
 
         if (!ns_connect_lru_resize(ns))
                 count += unused - ns->ns_max_unused;
@@ -1320,6 +1359,12 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
         LASSERT(pf != NULL);
 
         while (!list_empty(&ns->ns_unused_list)) {
+                ldlm_policy_res_t result;
+
+                /* Are all unused locks checked? */
+                if (remained-- <= 0)
+                        break;
+
                 /* For any flags, stop scanning if @max is reached. */
                 if (max && added >= max)
                         break;
@@ -1338,6 +1383,10 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
                 if (&lock->l_lru == &ns->ns_unused_list)
                         break;
 
+                LDLM_LOCK_GET(lock); /* dropped by bl thread */
+                spin_unlock(&ns->ns_unused_lock);
+
+                lock_res_and_lock(lock);
                 /* Pass the lock through the policy filter and see if it
                  * should stay in lru.
                  *
@@ -1351,13 +1400,20 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
                  * old locks, but additionally chose them by
                  * their weight. Big extent locks will stay in
                  * the cache. */
-                if (pf(ns, lock, unused, added, count) == LDLM_POLICY_KEEP_LOCK)
+                result = pf(ns, lock, unused, added, count);
+                if (result == LDLM_POLICY_KEEP_LOCK) {
+                        unlock_res_and_lock(lock);
+                        LDLM_LOCK_PUT(lock);
+                        spin_lock(&ns->ns_unused_lock);
                         break;
+                }
+                if (result == LDLM_POLICY_SKIP_LOCK) {
+                        unlock_res_and_lock(lock);
+                        LDLM_LOCK_PUT(lock);
+                        spin_lock(&ns->ns_unused_lock);
+                        continue;
+                }
 
-                LDLM_LOCK_GET(lock); /* dropped by bl thread */
-                spin_unlock(&ns->ns_unused_lock);
-
-                lock_res_and_lock(lock);
                 /* Check flags again under the lock. */
                 if ((lock->l_flags & LDLM_FL_CANCELING) ||
                     (ldlm_lock_remove_from_lru(lock) == 0)) {
@@ -1946,6 +2002,34 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
         RETURN(0);
 }
 
+/* Cancel as many unused locks as possible before replay. Since we are
+ * in recovery, we can't wait for any outstanding RPCs or send any RPCs
+ * to the server.
+ *
+ * Called only in recovery before replaying locks. There is no need to
+ * replay locks that are unused. Since the clients may hold thousands of
+ * cached unused locks, dropping the unused locks can greatly reduce the
+ * load on the servers at recovery time.
+ */
+static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
+{
+        int canceled;
+        CFS_LIST_HEAD(cancels);
+
+        CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before "
+               "replay for namespace %s (%d)\n", ns->ns_name, ns->ns_nr_unused);
+
+        /* we don't need to care whether or not LRU resize is enabled
+         * because the LDLM_CANCEL_NO_WAIT policy doesn't use the
+         * count parameter */
+        canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
+                                         LDLM_FL_LOCAL_ONLY,
+                                         LDLM_CANCEL_NO_WAIT);
+
+        CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
+               canceled, ns->ns_name);
+}
+
 int ldlm_replay_locks(struct obd_import *imp)
 {
         struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
@@ -1961,6 +2045,9 @@ int ldlm_replay_locks(struct obd_import *imp)
         /* ensure this doesn't fall to 0 before all have been queued */
         atomic_inc(&imp->imp_replay_inflight);
 
+        if (ldlm_cancel_unused_locks_before_replay)
+                ldlm_cancel_unused_locks_for_replay(ns);
+
         if (imp->imp_no_lock_replay) {
                 /* VBR: locks should be cancelled here */
                 ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c
index 99e85af..b767812 100644
--- a/lustre/ldlm/ldlm_resource.c
+++ b/lustre/ldlm/ldlm_resource.c
@@ -66,6 +66,8 @@ cfs_proc_dir_entry_t *ldlm_type_proc_dir = NULL;
 cfs_proc_dir_entry_t *ldlm_ns_proc_dir = NULL;
 cfs_proc_dir_entry_t *ldlm_svc_proc_dir = NULL;
 
+extern unsigned int ldlm_cancel_unused_locks_before_replay;
+
 #ifdef LPROCFS
 static int ldlm_proc_dump_ns(struct file *file, const char *buffer,
                              unsigned long count, void *data)
@@ -80,6 +82,9 @@ int ldlm_proc_setup(void)
 {
         int rc;
         struct lprocfs_vars list[] = {
                 { "dump_namespaces", NULL, ldlm_proc_dump_ns, NULL },
+                { "cancel_unused_locks_before_replay",
+                  lprocfs_rd_uint, lprocfs_wr_uint,
+                  &ldlm_cancel_unused_locks_before_replay, NULL },
                 { NULL }};
         ENTRY;
         LASSERT(ldlm_ns_proc_dir == NULL);
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
index 2fb34e2..bae823f 100644
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -1356,6 +1356,23 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
         RETURN(rc);
 }
 
+/* Determine whether the lock can be canceled before replaying it during
+ * recovery; a non-zero value is returned if the lock can be canceled,
+ * zero otherwise. */
+static int mdc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+        if (lock->l_resource->lr_type != LDLM_IBITS)
+                RETURN(0);
+
+        /* FIXME: if we ever get into a situation where there are too many
+         * opened files with open locks on a single node, then we really
+         * should replay these open locks to re-get them */
+        if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN)
+                RETURN(0);
+
+        RETURN(1);
+}
+
 static int mdc_setup(struct obd_device *obd, obd_count len, void *buf)
 {
         struct client_obd *cli = &obd->u.cli;
@@ -1387,6 +1404,8 @@ static int mdc_setup(struct obd_device *obd, obd_count len, void *buf)
         if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0)
                 ptlrpc_lprocfs_register_obd(obd);
 
+        ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery);
+
         rc = obd_llog_init(obd, obd, NULL);
         if (rc) {
                 mdc_cleanup(obd);
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index c890c71..2ccb080 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -4338,6 +4338,28 @@ static int osc_import_event(struct obd_device *obd,
         RETURN(rc);
 }
 
+/* Determine whether the lock can be canceled before replaying it
+ * during recovery; see bug 16774 for details.
+ *
+ * return values:
+ *   zero  - the lock can't be canceled
+ *   other - ok to cancel
+ */
+static int osc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+        check_res_locked(lock->l_resource);
+        if (lock->l_granted_mode == LCK_GROUP ||
+            lock->l_resource->lr_type != LDLM_EXTENT)
+                RETURN(0);
+
+        /* cancel all unused extent locks with granted mode LCK_PR or LCK_CR */
+        if (lock->l_granted_mode == LCK_PR ||
+            lock->l_granted_mode == LCK_CR)
+                RETURN(1);
+
+        RETURN(0);
+}
+
 int osc_setup(struct obd_device *obd, obd_count len, void *buf)
 {
         int rc;
@@ -4379,6 +4401,8 @@ int osc_setup(struct obd_device *obd, obd_count len, void *buf)
                 }
                 CFS_INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
                 sema_init(&cli->cl_grant_sem, 1);
+
+                ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery);
         }
 
         RETURN(rc);
-- 
1.8.3.1