From fe60e0135ee2334440247cde167b707b223cf11d Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Thu, 21 May 2015 11:07:54 -0400 Subject: [PATCH] LU-6529 ldlm: reclaim granted locks defensively To avoid ldlm locks exhausting server memory, two global parameters: ldlm_watermark_low & ldlm_watermark_high are used for reclaiming granted locks and rejecting incoming enqueue requests defensively. ldlm_watermark_low: When the number of granted locks reaches this threshold, the server starts to revoke locks gradually. ldlm_watermark_high: When the number of granted locks reaches this threshold, the server will return -EINPROGRESS to any incoming enqueue request until the lock count is shrunk below the threshold again. ldlm_watermark_low & ldlm_watermark_high are set to 20% & 30% of the total memory by default. They are tunable via proc entries; when a watermark is set to 0, the feature is disabled. Signed-off-by: Niu Yawei Change-Id: I2fab39ac0ab6f269b7f1a40f3e08b8a51807cc69 Reviewed-on: http://review.whamcloud.com/14931 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Bobi Jam Reviewed-by: Oleg Drokin --- libcfs/include/libcfs/libcfs_hash.h | 2 +- libcfs/libcfs/hash.c | 46 +++-- lustre/include/lustre_dlm.h | 9 + lustre/include/obd_support.h | 3 +- lustre/ldlm/Makefile.am | 2 +- lustre/ldlm/ldlm_internal.h | 9 + lustre/ldlm/ldlm_lock.c | 12 +- lustre/ldlm/ldlm_lockd.c | 18 +- lustre/ldlm/ldlm_pool.c | 16 +- lustre/ldlm/ldlm_reclaim.c | 368 ++++++++++++++++++++++++++++++++++++ lustre/ldlm/ldlm_request.c | 14 +- lustre/ldlm/ldlm_resource.c | 67 ++++++- lustre/mdc/mdc_locks.c | 13 +- lustre/ofd/ofd_dev.c | 1 + lustre/ptlrpc/Makefile.in | 1 + lustre/tests/sanity.sh | 73 +++++++ 16 files changed, 603 insertions(+), 51 deletions(-) create mode 100644 lustre/ldlm/ldlm_reclaim.c diff --git a/libcfs/include/libcfs/libcfs_hash.h b/libcfs/include/libcfs/libcfs_hash.h index 1d07b69..810c785 100644 --- a/libcfs/include/libcfs/libcfs_hash.h +++ b/libcfs/include/libcfs/libcfs_hash.h @@ 
-710,7 +710,7 @@ void cfs_hash_for_each_safe(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); int cfs_hash_for_each_nolock(struct cfs_hash *hs, cfs_hash_for_each_cb_t, - void *data); + void *data, int start); int cfs_hash_for_each_empty(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); diff --git a/libcfs/libcfs/hash.c b/libcfs/libcfs/hash.c index 504ed55..44bfaf4 100644 --- a/libcfs/libcfs/hash.c +++ b/libcfs/libcfs/hash.c @@ -1586,7 +1586,7 @@ EXPORT_SYMBOL(cfs_hash_size_get); */ static int cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, - void *data) + void *data, int start) { struct hlist_node *hnode; struct hlist_node *tmp; @@ -1594,19 +1594,25 @@ cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, __u32 version; int count = 0; int stop_on_change; - int rc; - int i; + int rc = 0; + int i, end = -1; ENTRY; stop_on_change = cfs_hash_with_rehash_key(hs) || !cfs_hash_with_no_itemref(hs) || hs->hs_ops->hs_put_locked == NULL; cfs_hash_lock(hs, 0); +again: LASSERT(!cfs_hash_is_rehashing(hs)); cfs_hash_for_each_bucket(hs, &bd, i) { struct hlist_head *hhead; + if (i < start) + continue; + else if (end > 0 && i >= end) + break; + cfs_hash_bd_lock(hs, &bd, 0); version = cfs_hash_bd_version_get(&bd); @@ -1646,14 +1652,20 @@ cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, if (rc) /* callback wants to break iteration */ break; } - cfs_hash_unlock(hs, 0); - return count; + if (start > 0 && rc != 0) { + end = start; + start = 0; + goto again; + } + + cfs_hash_unlock(hs, 0); + return count; } int cfs_hash_for_each_nolock(struct cfs_hash *hs, - cfs_hash_for_each_cb_t func, void *data) + cfs_hash_for_each_cb_t func, void *data, int start) { ENTRY; @@ -1667,11 +1679,11 @@ cfs_hash_for_each_nolock(struct cfs_hash *hs, hs->hs_ops->hs_put_locked == NULL)) RETURN(-EOPNOTSUPP); - cfs_hash_for_each_enter(hs); - cfs_hash_for_each_relax(hs, func, data); - cfs_hash_for_each_exit(hs); + 
cfs_hash_for_each_enter(hs); + cfs_hash_for_each_relax(hs, func, data, start); + cfs_hash_for_each_exit(hs); - RETURN(0); + RETURN(0); } EXPORT_SYMBOL(cfs_hash_for_each_nolock); @@ -1701,13 +1713,13 @@ cfs_hash_for_each_empty(struct cfs_hash *hs, hs->hs_ops->hs_put_locked == NULL)) return -EOPNOTSUPP; - cfs_hash_for_each_enter(hs); - while (cfs_hash_for_each_relax(hs, func, data)) { - CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", - hs->hs_name, i++); - } - cfs_hash_for_each_exit(hs); - RETURN(0); + cfs_hash_for_each_enter(hs); + while (cfs_hash_for_each_relax(hs, func, data, 0)) { + CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", + hs->hs_name, i++); + } + cfs_hash_for_each_exit(hs); + RETURN(0); } EXPORT_SYMBOL(cfs_hash_for_each_empty); diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 4839506..57c3c15 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -322,6 +322,10 @@ struct ldlm_ns_bucket { * fact the network or overall system load is at fault */ struct adaptive_timeout nsb_at_estimate; + /** + * Which res in the bucket should we start with the reclaim. + */ + int nsb_reclaim_start; }; enum { @@ -507,6 +511,11 @@ struct ldlm_namespace { * recalculation of LDLM pool statistics should be skipped. */ unsigned ns_stopping:1; + + /** + * Which bucket should we start with the lock reclaim. 
+ */ + int ns_reclaim_start; }; /** diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 272b87b..e0f3be1 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -352,10 +352,11 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LDLM_CP_CB_WAIT3 0x321 #define OBD_FAIL_LDLM_CP_CB_WAIT4 0x322 #define OBD_FAIL_LDLM_CP_CB_WAIT5 0x323 - #define OBD_FAIL_LDLM_SRV_BL_AST 0x324 #define OBD_FAIL_LDLM_SRV_CP_AST 0x325 #define OBD_FAIL_LDLM_SRV_GL_AST 0x326 +#define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 +#define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 /* LOCKLESS IO */ #define OBD_FAIL_LDLM_SET_CONTENTION 0x385 diff --git a/lustre/ldlm/Makefile.am b/lustre/ldlm/Makefile.am index bc996a7..af85299 100644 --- a/lustre/ldlm/Makefile.am +++ b/lustre/ldlm/Makefile.am @@ -42,4 +42,4 @@ MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ EXTRA_DIST = ldlm_extent.c ldlm_flock.c ldlm_internal.h ldlm_lib.c \ ldlm_lock.c ldlm_lockd.c ldlm_plain.c ldlm_request.c \ ldlm_resource.c l_lock.c ldlm_inodebits.c ldlm_pool.c \ - interval_tree.c + interval_tree.c ldlm_reclaim.c diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h index cdfbe9c..806f33b 100644 --- a/lustre/ldlm/ldlm_internal.h +++ b/lustre/ldlm/ldlm_internal.h @@ -343,3 +343,12 @@ void ldlm_flock_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, ldlm_wire_policy_data_t *wpolicy); + +/* ldlm_reclaim.c */ +extern __u64 ldlm_watermark_low; +extern __u64 ldlm_watermark_high; +int ldlm_reclaim_setup(void); +void ldlm_reclaim_cleanup(void); +void ldlm_reclaim_add(struct ldlm_lock *lock); +void ldlm_reclaim_del(struct ldlm_lock *lock); +bool ldlm_reclaim_full(void); diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index b3e914c..a324b93 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -2105,13 +2105,13 @@ static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd, 
*/ void ldlm_reprocess_all_ns(struct ldlm_namespace *ns) { - ENTRY; + ENTRY; - if (ns != NULL) { - cfs_hash_for_each_nolock(ns->ns_rs_hash, - ldlm_reprocess_res, NULL); - } - EXIT; + if (ns != NULL) { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_reprocess_res, NULL, 0); + } + EXIT; } /** diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index cf55d08..5f45863 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1265,7 +1265,14 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, flags |= LDLM_FL_RESENT; GOTO(existing_lock, rc = 0); } - } + } else { + if (ldlm_reclaim_full()) { + DEBUG_REQ(D_DLMTRACE, req, "Too many granted locks, " + "reject current enqueue request and let the " + "client retry later.\n"); + GOTO(out, rc = -EINPROGRESS); + } + } /* The lock's callback data might be set in the policy function */ lock = ldlm_lock_create(ns, &dlm_req->lock_desc.l_resource.lr_name, @@ -2942,6 +2949,12 @@ static int ldlm_setup(void) CERROR("Failed to initialize LDLM pools: %d\n", rc); GOTO(out, rc); } + + rc = ldlm_reclaim_setup(); + if (rc) { + CERROR("Failed to setup reclaim thread: rc = %d\n", rc); + GOTO(out, rc); + } RETURN(0); out: @@ -2961,7 +2974,8 @@ static int ldlm_cleanup(void) RETURN(-EBUSY); } - ldlm_pools_fini(); + ldlm_reclaim_cleanup(); + ldlm_pools_fini(); if (ldlm_state->ldlm_bl_pool != NULL) { struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; diff --git a/lustre/ldlm/ldlm_pool.c b/lustre/ldlm/ldlm_pool.c index ea1971f..f75af5c 100644 --- a/lustre/ldlm/ldlm_pool.c +++ b/lustre/ldlm/ldlm_pool.c @@ -912,10 +912,16 @@ void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) * cancelled, instead special kind of lock is used to drop them. * also there is no LRU for flock locks, so no point in tracking * them anyway. + * + * PLAIN locks are used by config and quota, the quantity is small + * and usually they are not in LRU. 
*/ - if (lock->l_resource->lr_type == LDLM_FLOCK) + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) return; + ldlm_reclaim_add(lock); + atomic_inc(&pl->pl_granted); atomic_inc(&pl->pl_grant_rate); lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); @@ -935,11 +941,15 @@ void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) { /* - * Filter out FLOCK locks. Read above comment in ldlm_pool_add(). + * Filter out FLOCK & PLAIN locks. Read above comment in + * ldlm_pool_add(). */ - if (lock->l_resource->lr_type == LDLM_FLOCK) + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) return; + ldlm_reclaim_del(lock); + LASSERT(atomic_read(&pl->pl_granted) > 0); atomic_dec(&pl->pl_granted); atomic_inc(&pl->pl_cancel_rate); diff --git a/lustre/ldlm/ldlm_reclaim.c b/lustre/ldlm/ldlm_reclaim.c new file mode 100644 index 0000000..01ec441 --- /dev/null +++ b/lustre/ldlm/ldlm_reclaim.c @@ -0,0 +1,368 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, Intel Corporation. + * Use is subject to license terms. 
+ * + * Author: Niu Yawei + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include "ldlm_internal.h" + +/* + * To avoid ldlm lock exhausting server memory, two global parameters: + * ldlm_watermark_low & ldlm_watermark_high are used for reclaiming + * granted locks and rejecting incoming enqueue requests defensively. + * + * ldlm_watermark_low: When the amount of granted locks reaching this + * threshold, server start to revoke locks gradually. + * + * ldlm_watermark_high: When the amount of granted locks reaching this + * threshold, server will return -EINPROGRESS to any incoming enqueue + * request until the lock count is shrunk below the threshold again. + * + * ldlm_watermark_low & ldlm_watermark_high is set to 20% & 30% of the + * total memory by default. It is tunable via proc entry, when it's set + * to 0, the feature is disabled. + */ + +/* + * FIXME: + * + * In current implementation, server identifies which locks should be + * revoked by choosing locks from namespace/resource in a roundrobin + * manner, which isn't optimal. The ideal way should be server notifies + * clients to cancel locks voluntarily, because only client knows exactly + * when the lock is last used. + * + * However how to notify client immediately is a problem, one idea + * is to leverage the glimplse callbacks on some artificial global + * lock (like quota global lock does), but that requires protocol + * changes, let's fix it in future long-term solution. 
+ */ + +__u64 ldlm_watermark_low; +__u64 ldlm_watermark_high; + +#ifdef HAVE_SERVER_SUPPORT + +static struct percpu_counter ldlm_granted_total; +static atomic_t ldlm_nr_reclaimer; +static cfs_duration_t ldlm_last_reclaim_age; +static cfs_time_t ldlm_last_reclaim_time; + +struct ldlm_reclaim_cb_data { + struct list_head rcd_rpc_list; + int rcd_added; + int rcd_total; + int rcd_cursor; + int rcd_start; + bool rcd_skip; + cfs_duration_t rcd_age; + struct cfs_hash_bd *rcd_prev_bd; +}; + +static inline bool ldlm_lock_reclaimable(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + /* FLOCK & PLAIN lock are not reclaimable. FLOCK is + * explicitly controlled by application, PLAIN lock + * is used by quota global lock and config lock. + */ + if (ns->ns_client == LDLM_NAMESPACE_SERVER && + (lock->l_resource->lr_type == LDLM_IBITS || + lock->l_resource->lr_type == LDLM_EXTENT)) + return true; + return false; +} + +static int ldlm_reclaim_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res; + struct ldlm_reclaim_cb_data *data; + struct ldlm_lock *lock; + struct ldlm_ns_bucket *nsb; + int rc = 0; + + data = (struct ldlm_reclaim_cb_data *)arg; + + LASSERTF(data->rcd_added < data->rcd_total, "added:%d >= total:%d\n", + data->rcd_added, data->rcd_total); + + nsb = cfs_hash_bd_extra_get(hs, bd); + res = cfs_hash_object(hs, hnode); + + if (data->rcd_prev_bd != bd) { + if (data->rcd_prev_bd != NULL) + ldlm_res_to_ns(res)->ns_reclaim_start++; + data->rcd_prev_bd = bd; + data->rcd_cursor = 0; + data->rcd_start = nsb->nsb_reclaim_start % + cfs_hash_bd_count_get(bd); + } + + if (data->rcd_skip && data->rcd_cursor < data->rcd_start) { + data->rcd_cursor++; + return 0; + } + + nsb->nsb_reclaim_start++; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (!ldlm_lock_reclaimable(lock)) + continue; + + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW) && + 
cfs_time_before(cfs_time_current(), + cfs_time_add(lock->l_last_used, + data->rcd_age))) + continue; + + if (!ldlm_is_ast_sent(lock)) { + ldlm_set_ast_sent(lock); + LASSERT(list_empty(&lock->l_rk_ast)); + list_add(&lock->l_rk_ast, &data->rcd_rpc_list); + LDLM_LOCK_GET(lock); + if (++data->rcd_added == data->rcd_total) { + rc = 1; /* stop the iteration */ + break; + } + } + } + unlock_res(res); + + return rc; +} + +static void ldlm_reclaim_res(struct ldlm_namespace *ns, int *count, + cfs_duration_t age, bool skip) +{ + struct ldlm_reclaim_cb_data data; + int idx, type, start; + ENTRY; + + LASSERT(*count != 0); + + if (ns->ns_obd) { + type = server_name2index(ns->ns_obd->obd_name, &idx, NULL); + if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) { + EXIT; + return; + } + } + + if (atomic_read(&ns->ns_bref) == 0) { + EXIT; + return; + } + + INIT_LIST_HEAD(&data.rcd_rpc_list); + data.rcd_added = 0; + data.rcd_total = *count; + data.rcd_age = age; + data.rcd_skip = skip; + data.rcd_prev_bd = NULL; + start = ns->ns_reclaim_start % CFS_HASH_NBKT(ns->ns_rs_hash); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_reclaim_lock_cb, &data, + start); + + CDEBUG(D_DLMTRACE, "NS(%s): %d locks to be reclaimed, found %d/%d " + "locks.\n", ldlm_ns_name(ns), *count, data.rcd_added, + data.rcd_total); + + LASSERTF(*count >= data.rcd_added, "count:%d, added:%d\n", *count, + data.rcd_added); + + ldlm_run_ast_work(ns, &data.rcd_rpc_list, LDLM_WORK_REVOKE_AST); + *count -= data.rcd_added; + EXIT; +} + +#define LDLM_RECLAIM_BATCH 512 +#define LDLM_RECLAIM_AGE_MIN cfs_time_seconds(300) +#define LDLM_RECLAIM_AGE_MAX (LDLM_DEFAULT_MAX_ALIVE * 3 / 4) + +static inline cfs_duration_t ldlm_reclaim_age(void) +{ + cfs_duration_t age; + + age = ldlm_last_reclaim_age + + cfs_time_sub(cfs_time_current(), ldlm_last_reclaim_time); + if (age > LDLM_RECLAIM_AGE_MAX) + age = LDLM_RECLAIM_AGE_MAX; + else if (age < (LDLM_RECLAIM_AGE_MIN * 2)) + age = LDLM_RECLAIM_AGE_MIN; + return age; +} + +static 
void ldlm_reclaim_ns(void) +{ + struct ldlm_namespace *ns; + int count = LDLM_RECLAIM_BATCH; + int ns_nr, nr_processed; + ldlm_side_t ns_cli = LDLM_NAMESPACE_SERVER; + cfs_duration_t age; + bool skip = true; + ENTRY; + + if (!atomic_add_unless(&ldlm_nr_reclaimer, 1, 1)) { + EXIT; + return; + } + + age = ldlm_reclaim_age(); +again: + nr_processed = 0; + ns_nr = ldlm_namespace_nr_read(ns_cli); + while (count > 0 && nr_processed < ns_nr) { + mutex_lock(ldlm_namespace_lock(ns_cli)); + + if (list_empty(ldlm_namespace_list(ns_cli))) { + mutex_unlock(ldlm_namespace_lock(ns_cli)); + goto out; + } + + ns = ldlm_namespace_first_locked(ns_cli); + ldlm_namespace_move_to_active_locked(ns, ns_cli); + mutex_unlock(ldlm_namespace_lock(ns_cli)); + + ldlm_reclaim_res(ns, &count, age, skip); + ldlm_namespace_put(ns); + nr_processed++; + } + + if (count > 0 && age > LDLM_RECLAIM_AGE_MIN) { + age >>= 1; + if (age < (LDLM_RECLAIM_AGE_MIN * 2)) + age = LDLM_RECLAIM_AGE_MIN; + skip = false; + goto again; + } + + ldlm_last_reclaim_age = age; + ldlm_last_reclaim_time = cfs_time_current(); +out: + atomic_add_unless(&ldlm_nr_reclaimer, -1, 0); + EXIT; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_add(&ldlm_granted_total, 1); + lock->l_last_used = cfs_time_current(); +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_sub(&ldlm_granted_total, 1); +} + +bool ldlm_reclaim_full(void) +{ + __u64 high = ldlm_watermark_high; + __u64 low = ldlm_watermark_low; + + if (low != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW)) + low = cfs_fail_val; + + if (low != 0 && + percpu_counter_read_positive(&ldlm_granted_total) > low) + ldlm_reclaim_ns(); + + if (high != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH)) + high = cfs_fail_val; + + if (high != 0 && + percpu_counter_read_positive(&ldlm_granted_total) > high) + return true; + + return false; +} + +static inline 
__u64 ldlm_ratio2locknr(int ratio) +{ + __u64 locknr; + + locknr = ((__u64)NUM_CACHEPAGES << PAGE_CACHE_SHIFT) * ratio; + do_div(locknr, 100 * sizeof(struct ldlm_lock)); + + return locknr; +} + +#define LDLM_WM_RATIO_LOW_DEFAULT 20 +#define LDLM_WM_RATIO_HIGH_DEFAULT 30 + +int ldlm_reclaim_setup(void) +{ + atomic_set(&ldlm_nr_reclaimer, 0); + ldlm_watermark_low = ldlm_ratio2locknr(LDLM_WM_RATIO_LOW_DEFAULT); + ldlm_watermark_high = ldlm_ratio2locknr(LDLM_WM_RATIO_HIGH_DEFAULT); + ldlm_last_reclaim_age = LDLM_RECLAIM_AGE_MAX; + ldlm_last_reclaim_time = cfs_time_current(); + + return percpu_counter_init(&ldlm_granted_total, 0); +} + +void ldlm_reclaim_cleanup(void) +{ + percpu_counter_destroy(&ldlm_granted_total); +} + +#else /* HAVE_SERVER_SUPPORT */ + +bool ldlm_reclaim_full(void) +{ + return false; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ +} + +int ldlm_reclaim_setup(void) +{ + return 0; +} + +void ldlm_reclaim_cleanup(void) +{ +} + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index d3049c0..7b90e49 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -2014,11 +2014,11 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL, LCK_MINMODE, flags, opaque)); - } else { - cfs_hash_for_each_nolock(ns->ns_rs_hash, - ldlm_cli_hash_cancel_unused, &arg); - RETURN(ELDLM_OK); - } + } else { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_cli_hash_cancel_unused, &arg, 0); + RETURN(ELDLM_OK); + } } /* Lock iterators. 
*/ @@ -2088,8 +2088,8 @@ void ldlm_namespace_foreach(struct ldlm_namespace *ns, { struct iter_helper_data helper = { .iter = iter, .closure = closure }; - cfs_hash_for_each_nolock(ns->ns_rs_hash, - ldlm_res_iter_helper, &helper); + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_iter_helper, &helper, 0); } diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index fd7b65d..084ad80 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -84,6 +84,55 @@ LPROC_SEQ_FOPS_WO_TYPE(ldlm, dump_ns); LPROC_SEQ_FOPS_RW_TYPE(ldlm_rw, uint); LPROC_SEQ_FOPS_RO_TYPE(ldlm, uint); +/* Lock count is stored in the watermark, and it's display as number of MB + * memory consumed by the locks */ +static int seq_watermark_show(struct seq_file *m, void *data) +{ + __u64 locknr = *(__u64 *)m->private; + return seq_printf(m, LPU64"\n", + (locknr * sizeof(struct ldlm_lock)) >> 20); +} + +static ssize_t seq_watermark_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + __u64 watermark; + __u64 *data = ((struct seq_file *)file->private_data)->private; + int rc; + + rc = lprocfs_write_frac_u64_helper(buffer, count, &watermark, 1 << 20); + if (rc) { + CERROR("Failed to set LDLM watermark, rc = %d.\n", rc); + return rc; + } else if (watermark != 0 && watermark < (1 << 20)) { + CERROR("Watermark should be greater than 1MB.\n"); + return -EINVAL; + } + + do_div(watermark, sizeof(struct ldlm_lock)); + *data = watermark; + + if (ldlm_watermark_low != 0 && ldlm_watermark_high != 0 && + ldlm_watermark_low > ldlm_watermark_high) + ldlm_watermark_low = ldlm_watermark_high; + return count; +} + +static int seq_watermark_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_watermark_show, PDE_DATA(inode)); +} + +static const struct file_operations ldlm_watermark_fops = { + .owner = THIS_MODULE, + .open = seq_watermark_open, + .read = seq_read, + .write = seq_watermark_write, + .llseek = seq_lseek, + 
.release = lprocfs_single_release, +}; + int ldlm_proc_setup(void) { int rc; @@ -97,6 +146,12 @@ int ldlm_proc_setup(void) { .name = "cancel_unused_locks_before_replay", .fops = &ldlm_rw_uint_fops, .data = &ldlm_cancel_unused_locks_before_replay }, + { .name = "watermark_mb_low", + .fops = &ldlm_watermark_fops, + .data = &ldlm_watermark_low }, + { .name = "watermark_mb_high", + .fops = &ldlm_watermark_fops, + .data = &ldlm_watermark_high }, { NULL }}; ENTRY; LASSERT(ldlm_ns_proc_dir == NULL); @@ -613,6 +668,7 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0); nsb->nsb_namespace = ns; + nsb->nsb_reclaim_start = 0; } ns->ns_obd = obd; @@ -638,6 +694,7 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, ns->ns_orig_connect_flags = 0; ns->ns_connect_flags = 0; ns->ns_stopping = 0; + ns->ns_reclaim_start = 0; rc = ldlm_namespace_proc_register(ns); if (rc != 0) { CERROR("Can't initialize ns proc, rc %d\n", rc); @@ -797,9 +854,11 @@ int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags) return ELDLM_OK; } - cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, &flags); - cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, NULL); - return ELDLM_OK; + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, + &flags, 0); + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, + NULL, 0); + return ELDLM_OK; } EXPORT_SYMBOL(ldlm_namespace_cleanup); @@ -1365,7 +1424,7 @@ void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_res_hash_dump, - (void *)(unsigned long)level); + (void *)(unsigned long)level, 0); spin_lock(&ns->ns_lock); ns->ns_next_dump = cfs_time_shift(10); spin_unlock(&ns->ns_lock); diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 9a17940..9ebef67 100644 --- a/lustre/mdc/mdc_locks.c 
+++ b/lustre/mdc/mdc_locks.c @@ -817,11 +817,6 @@ resend: if (IS_ERR(req)) RETURN(PTR_ERR(req)); - if (req != NULL && it && it->it_op & IT_CREAT) - /* ask ptlrpc not to resend on EINPROGRESS since we have our own - * retry logic */ - req->rq_no_retry_einprogress = 1; - if (resends) { req->rq_generation_set = 1; req->rq_import_generation = generation; @@ -879,10 +874,10 @@ resend: lockrep->lock_policy_res2 = ptlrpc_status_ntoh(lockrep->lock_policy_res2); - /* Retry the create infinitely when we get -EINPROGRESS from - * server. This is required by the new quota design. */ - if (it && it->it_op & IT_CREAT && - (int)lockrep->lock_policy_res2 == -EINPROGRESS) { + /* Retry infinitely when the server returns -EINPROGRESS for the + * intent operation, when server returns -EINPROGRESS for acquiring + * intent lock, we'll retry in after_reply(). */ + if (it && (int)lockrep->lock_policy_res2 == -EINPROGRESS) { mdc_clear_replay_flag(req, rc); ptlrpc_req_finished(req); resends++; diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index c4b3c9b..631987e 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -2269,6 +2269,7 @@ static int ofd_prolong_extent_locks(struct tgt_session_info *tsi, LDLM_LOCK_PUT(lock); RETURN(lock_count); } + lock->l_last_used = cfs_time_current(); LDLM_LOCK_PUT(lock); } } diff --git a/lustre/ptlrpc/Makefile.in b/lustre/ptlrpc/Makefile.in index 37c86cf..7521a98 100644 --- a/lustre/ptlrpc/Makefile.in +++ b/lustre/ptlrpc/Makefile.in @@ -8,6 +8,7 @@ ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o ldlm_objs += $(LDLM)ldlm_pool.o $(LDLM)interval_tree.o +ldlm_objs += $(LDLM)ldlm_reclaim.o target_objs := $(TARGET)tgt_main.o $(TARGET)tgt_lastrcvd.o target_objs += $(TARGET)tgt_handler.o $(TARGET)out_handler.o diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index bbf65c3..42562f5 100644 --- a/lustre/tests/sanity.sh +++ 
b/lustre/tests/sanity.sh @@ -9374,6 +9374,79 @@ test_133g() { } run_test 133g "Check for Oopses on bad io area writes/reads in /proc" +test_134a() { + [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.54) ]] && + skip "Need MDS version at least 2.7.54" && return + + mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" + cancel_lru_locks mdc + + local nsdir="ldlm.namespaces.*-MDT0000-mdc-*" + local unused=$($LCTL get_param -n $nsdir.lock_unused_count) + [ $unused -eq 0 ] || error "$unused locks are not cleared" + + local nr=1000 + createmany -o $DIR/$tdir/f $nr || + error "failed to create $nr files in $DIR/$tdir" + unused=$($LCTL get_param -n $nsdir.lock_unused_count) + + #define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 + do_facet mds1 $LCTL set_param fail_loc=0x327 + do_facet mds1 $LCTL set_param fail_val=500 + touch $DIR/$tdir/m + + echo "sleep 10 seconds ..." + sleep 10 + local lck_cnt=$($LCTL get_param -n $nsdir.lock_unused_count) + + do_facet mds1 $LCTL set_param fail_loc=0 + do_facet mds1 $LCTL set_param fail_val=0 + [ $lck_cnt -lt $unused ] || + error "No locks reclaimed, before:$unused, after:$lck_cnt" + + rm $DIR/$tdir/m + unlinkmany $DIR/$tdir/f $nr +} +run_test 134a "Server reclaims locks when reaching low watermark" + +test_134b() { + [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.54) ]] && + skip "Need MDS version at least 2.7.54" && return + + mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" + cancel_lru_locks mdc + + local low_wm=$(do_facet mds1 $LCTL get_param -n ldlm.watermark_mb_low) + # disable reclaim temporarily + do_facet mds1 $LCTL set_param ldlm.watermark_mb_low=0 + + #define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 + do_facet mds1 $LCTL set_param fail_loc=0x328 + do_facet mds1 $LCTL set_param fail_val=500 + + $LCTL set_param debug=+trace + + local nr=600 + createmany -o $DIR/$tdir/f $nr & + local create_pid=$! + + echo "Sleep $TIMEOUT seconds ..." + sleep $TIMEOUT + if ! 
ps -p $create_pid > /dev/null 2>&1; then + do_facet mds1 $LCTL set_param fail_loc=0 + do_facet mds1 $LCTL set_param fail_val=0 + do_facet mds1 $LCTL set_param ldlm.watermark_mb_low=$low_wm + error "createmany finished incorrectly!" + fi + do_facet mds1 $LCTL set_param fail_loc=0 + do_facet mds1 $LCTL set_param fail_val=0 + do_facet mds1 $LCTL set_param ldlm.watermark_mb_low=$low_wm + wait $create_pid || return 1 + + unlinkmany $DIR/$tdir/f $nr +} +run_test 134b "Server rejects lock request when reaching high watermark" + test_140() { #bug-17379 [ $PARALLEL == "yes" ] && skip "skip parallel run" && return test_mkdir -p $DIR/$tdir || error "Creating dir $DIR/$tdir" -- 1.8.3.1