From 9011cfeec10f67583717ec1daf128975a1d9d72d Mon Sep 17 00:00:00 2001 From: Eric Mei Date: Mon, 30 Aug 2010 19:24:57 +0400 Subject: [PATCH] b=16774 mdc to cancel unused dlm locks before replay. r=oleg.drokin r=di.wang --- lustre/include/lustre_dlm.h | 16 +++++++ lustre/ldlm/ldlm_internal.h | 7 ++- lustre/ldlm/ldlm_lock.c | 2 + lustre/ldlm/ldlm_request.c | 99 +++++++++++++++++++++++++++++++++++++++++-- lustre/ldlm/ldlm_resource.c | 5 +++ lustre/mdc/mdc_request.c | 21 +++++++++ lustre/tests/replay-single.sh | 24 +++++++++++ 7 files changed, 169 insertions(+), 5 deletions(-) diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 1747a77..bf0d9f8 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -203,6 +203,10 @@ typedef enum { * emulation + race with upcoming bl_ast. */ #define LDLM_FL_FAIL_LOC 0x100000000ULL +/* Set while scanning the unused list (LRU) to mark a lock that was already + * examined and skipped; cleared when the lock is removed from the LRU */ +#define LDLM_FL_SKIPPED 0x200000000ULL + /* The blocking callback is overloaded to perform two functions. These flags * indicate which operation should be performed. 
*/ #define LDLM_CB_BLOCKING 1 @@ -368,6 +372,8 @@ typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, void *req_cookie, ldlm_mode_t mode, int flags, void *data); +typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock); + struct ldlm_valblock_ops { int (*lvbo_init)(struct ldlm_resource *res); int (*lvbo_update)(struct ldlm_resource *res, @@ -484,6 +490,9 @@ struct ldlm_namespace { struct obd_device *ns_obd; struct adaptive_timeout ns_at_estimate;/* estimated lock callback time*/ + + /* decides whether a lock may be canceled before replay in recovery */ + ldlm_cancel_for_recovery ns_cancel_for_recovery; }; static inline int ns_is_client(struct ldlm_namespace *ns) @@ -512,6 +521,13 @@ static inline int ns_connect_lru_resize(struct ldlm_namespace *ns) return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE); } +static inline void ns_register_cancel(struct ldlm_namespace *ns, + ldlm_cancel_for_recovery arg) +{ + LASSERT(ns != NULL); + ns->ns_cancel_for_recovery = arg; +} + /* * * Resource hash table diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h index d6ff17f..c6345cb 100644 --- a/lustre/ldlm/ldlm_internal.h +++ b/lustre/ldlm/ldlm_internal.h @@ -72,7 +72,9 @@ enum { LDLM_CANCEL_AGED = 1 << 0, /* Cancel aged locks (non lru resize). */ LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */ LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */ - LDLM_CANCEL_LRUR = 1 << 3 /* Cancel locks from lru resize. 
*/ + LDLM_CANCEL_NO_WAIT = 1 << 4 /* Cancel locks w/o blocking (neither + * sending nor waiting for any rpcs) */ }; int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync, @@ -200,7 +202,8 @@ void ldlm_exit(void); enum ldlm_policy_res { LDLM_POLICY_CANCEL_LOCK, - LDLM_POLICY_KEEP_LOCK + LDLM_POLICY_KEEP_LOCK, + LDLM_POLICY_SKIP_LOCK }; typedef enum ldlm_policy_res ldlm_policy_res_t; diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 2cdd68b..c000c42b 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -187,6 +187,8 @@ int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock) struct ldlm_namespace *ns = lock->l_resource->lr_namespace; LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); cfs_list_del_init(&lock->l_lru); + if (lock->l_flags & LDLM_FL_SKIPPED) + lock->l_flags &= ~LDLM_FL_SKIPPED; LASSERT(ns->ns_nr_unused > 0); ns->ns_nr_unused--; rc = 1; diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index e283eee..dc59d90 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -50,6 +50,9 @@ int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644, "lock enqueue timeout minimum"); +/* on the client: whether unused cached locks are canceled before replay */ +unsigned int ldlm_cancel_unused_locks_before_replay = 1; + static void interrupted_completion_wait(void *data) { } @@ -1311,6 +1314,37 @@ int ldlm_cli_cancel_list_local(cfs_list_t *cancels, int count, } /** + * Cancel as many locks as possible w/o sending any rpcs (e.g. to write back + * dirty data, to close a file, ...) or waiting for any rpcs in-flight (e.g. + * readahead requests, ...) 
+ */ +static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK; + ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery; + lock_res_and_lock(lock); + + /* don't check added & count since we want to process all locks + * from unused list */ + switch (lock->l_resource->lr_type) { + case LDLM_EXTENT: + case LDLM_IBITS: + if (cb && cb(lock)) + break; + default: /* or fell through: cb absent or returned 0 */ + result = LDLM_POLICY_SKIP_LOCK; + lock->l_flags |= LDLM_FL_SKIPPED; + break; + } + + unlock_res_and_lock(lock); + RETURN(result); +} + +/** * Callback function for lru-resize policy. Makes decision whether to keep * \a lock in LRU for current \a LRU size \a unused, added in current scan * \a added and number of locks to be preferably canceled \a count. @@ -1431,6 +1465,9 @@ typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *, static ldlm_cancel_lru_policy_t ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags) { + if (flags & LDLM_CANCEL_NO_WAIT) + return ldlm_cancel_no_wait_policy; + if (ns_connect_lru_resize(ns)) { if (flags & LDLM_CANCEL_SHRINK) /* We kill passed number of old locks. */ @@ -1472,17 +1509,23 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags) * memory pressre policy function; * * flags & LDLM_CANCEL_AGED - cancel alocks according to "aged policy". + * + * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible + * (typically before replaying locks) w/o + * sending any rpcs or waiting for any + * outstanding rpc to complete. 
*/ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, cfs_list_t *cancels, int count, int max, int flags) { ldlm_cancel_lru_policy_t pf; struct ldlm_lock *lock, *next; - int added = 0, unused; + int added = 0, unused, remained; ENTRY; cfs_spin_lock(&ns->ns_unused_lock); unused = ns->ns_nr_unused; + remained = unused; if (!ns_connect_lru_resize(ns)) count += unused - ns->ns_max_unused; @@ -1491,6 +1534,12 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, cfs_list_t *cancels, LASSERT(pf != NULL); while (!cfs_list_empty(&ns->ns_unused_list)) { + ldlm_policy_res_t result; + + /* bound the scan by the unused count sampled on entry */ + if (remained-- <= 0) + break; + /* For any flags, stop scanning if @max is reached. */ if (max && added >= max) break; @@ -1500,6 +1549,11 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, cfs_list_t *cancels, /* No locks which got blocking requests. */ LASSERT(!(lock->l_flags & LDLM_FL_BL_AST)); + if (flags & LDLM_CANCEL_NO_WAIT && + lock->l_flags & LDLM_FL_SKIPPED) + /* already examined and skipped */ + continue; + /* Somebody is already doing CANCEL. No need in this * lock in lru, do not traverse it again. */ if (!(lock->l_flags & LDLM_FL_CANCELING)) @@ -1527,14 +1581,21 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, cfs_list_t *cancels, * old locks, but additionally chose them by * their weight. Big extent locks will stay in * the cache. */ - if (pf(ns, lock, unused, added, count) == - LDLM_POLICY_KEEP_LOCK) { + result = pf(ns, lock, unused, added, count); + if (result == LDLM_POLICY_KEEP_LOCK) { lu_ref_del(&lock->l_reference, __FUNCTION__, cfs_current()); LDLM_LOCK_RELEASE(lock); cfs_spin_lock(&ns->ns_unused_lock); break; } + if (result == LDLM_POLICY_SKIP_LOCK) { + lu_ref_del(&lock->l_reference, + __FUNCTION__, cfs_current()); + LDLM_LOCK_RELEASE(lock); + cfs_spin_lock(&ns->ns_unused_lock); + continue; + } lock_res_and_lock(lock); /* Check flags again under the lock. 
*/ @@ -2105,6 +2166,35 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) RETURN(0); } +/** + * Cancel as many unused locks as possible before replay. Since we are + * in recovery, we can neither wait for outstanding RPCs nor send any RPC + * to the server. + * + * Called only in recovery before replaying locks. There is no need to + * replay locks that are unused. Since the clients may hold thousands of + * cached unused locks, dropping the unused locks can greatly reduce the + * load on the servers at recovery time. + */ +static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns) +{ + int canceled; + CFS_LIST_HEAD(cancels); + + CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before" + " replay for namespace %s (%d)\n", ns->ns_name, + ns->ns_nr_unused); + + /* We don't need to care whether or not LRU resize is enabled + * because the LDLM_CANCEL_NO_WAIT policy doesn't use the + * count parameter */ + canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0, + LCF_LOCAL, LDLM_CANCEL_NO_WAIT); + + CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n", + canceled, ns->ns_name); +} + int ldlm_replay_locks(struct obd_import *imp) { struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; @@ -2123,6 +2213,9 @@ int ldlm_replay_locks(struct obd_import *imp) /* ensure this doesn't fall to 0 before all have been queued */ cfs_atomic_inc(&imp->imp_replay_inflight); + if (ldlm_cancel_unused_locks_before_replay) + ldlm_cancel_unused_locks_for_replay(ns); + (void)ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list); cfs_list_for_each_entry_safe(lock, next, &list, l_pending_chain) { diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 6d0d4a1..2d947fe 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -64,6 +64,8 @@ cfs_proc_dir_entry_t *ldlm_type_proc_dir = NULL; cfs_proc_dir_entry_t *ldlm_ns_proc_dir = NULL; cfs_proc_dir_entry_t 
*ldlm_svc_proc_dir = NULL; +extern unsigned int ldlm_cancel_unused_locks_before_replay; + #ifdef LPROCFS static int ldlm_proc_dump_ns(struct file *file, const char *buffer, unsigned long count, void *data) @@ -78,6 +80,9 @@ int ldlm_proc_setup(void) int rc; struct lprocfs_vars list[] = { { "dump_namespaces", NULL, ldlm_proc_dump_ns, NULL }, + { "cancel_unused_locks_before_replay", + lprocfs_rd_uint, lprocfs_wr_uint, + &ldlm_cancel_unused_locks_before_replay, NULL }, { NULL }}; ENTRY; LASSERT(ldlm_ns_proc_dir == NULL); diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index a1e4044..7c1db00 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -1908,6 +1908,25 @@ struct obd_uuid *mdc_get_uuid(struct obd_export *exp) { return &cli->cl_target_uuid; } +/** + * Determine whether the lock can be canceled before replaying it during + * recovery. Returns non-zero if the lock can be canceled, or zero if it + * must be kept (non-IBITS locks and locks carrying the OPEN bit). + */ +static int mdc_cancel_for_recovery(struct ldlm_lock *lock) +{ + if (lock->l_resource->lr_type != LDLM_IBITS) + RETURN(0); + + /* FIXME: if we ever get into a situation where there are too many + * opened files with open locks on a single node, then we really + * should replay these open locks to re-get them */ + if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN) + RETURN(0); + + RETURN(1); +} + static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) { struct client_obd *cli = &obd->u.cli; @@ -1940,6 +1959,8 @@ static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) sptlrpc_lprocfs_cliobd_attach(obd); ptlrpc_lprocfs_register_obd(obd); + ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery); + rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL); if (rc) { mdc_cleanup(obd); diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 23fa404..6be1270 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -2073,6 +2073,30 
@@ test_84a() { } run_test 84a "stale open during export disconnect" +test_85a() { #bug 16774: unused lock count must drop across MDS failover + lctl set_param -n ldlm.cancel_unused_locks_before_replay "1" + + for i in `seq 100`; do + echo "tag-$i" > $DIR/$tfile-$i + grep -q "tag-$i" $DIR/$tfile-$i || error "f2-$i" + done + + lov_id=`lctl dl | grep "clilov"` + addr=`echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}'` + count=`lctl get_param -n ldlm.namespaces.*MDT0000*$addr.lock_unused_count` + echo "before recovery: unused locks count = $count" + + fail $SINGLEMDS + + count2=`lctl get_param -n ldlm.namespaces.*MDT0000*$addr.lock_unused_count` + echo "after recovery: unused locks count = $count2" + + if [ $count2 -ge $count ]; then + error "unused locks are not canceled" + fi +} +run_test 85a "check the cancellation of unused locks during recovery(IBITS)" + test_86() { local clients=${CLIENTS:-$HOSTNAME} -- 1.8.3.1