From fe60e0135ee2334440247cde167b707b223cf11d Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Thu, 21 May 2015 11:07:54 -0400 Subject: [PATCH] LU-6529 ldlm: reclaim granted locks defensively To avoid ldlm locks exhausting server memory, two global parameters: ldlm_watermark_low & ldlm_watermark_high are used for reclaiming granted locks and rejecting incoming enqueue requests defensively. ldlm_watermark_low: When the number of granted locks reaches this threshold, the server starts to revoke locks gradually. ldlm_watermark_high: When the number of granted locks reaches this threshold, the server will return -EINPROGRESS to any incoming enqueue request until the lock count is shrunk below the threshold again. ldlm_watermark_low & ldlm_watermark_high are set to 20% & 30% of the total memory by default. They are tunable via proc entries; when a watermark is set to 0, the feature is disabled. Signed-off-by: Niu Yawei Change-Id: I2fab39ac0ab6f269b7f1a40f3e08b8a51807cc69 Reviewed-on: http://review.whamcloud.com/14931 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Bobi Jam Reviewed-by: Oleg Drokin --- libcfs/include/libcfs/libcfs_hash.h | 2 +- libcfs/libcfs/hash.c | 46 +++-- lustre/include/lustre_dlm.h | 9 + lustre/include/obd_support.h | 3 +- lustre/ldlm/Makefile.am | 2 +- lustre/ldlm/ldlm_internal.h | 9 + lustre/ldlm/ldlm_lock.c | 12 +- lustre/ldlm/ldlm_lockd.c | 18 +- lustre/ldlm/ldlm_pool.c | 16 +- lustre/ldlm/ldlm_reclaim.c | 368 ++++++++++++++++++++++++++++++++++++ lustre/ldlm/ldlm_request.c | 14 +- lustre/ldlm/ldlm_resource.c | 67 ++++++- lustre/mdc/mdc_locks.c | 13 +- lustre/ofd/ofd_dev.c | 1 + lustre/ptlrpc/Makefile.in | 1 + lustre/tests/sanity.sh | 73 +++++++ 16 files changed, 603 insertions(+), 51 deletions(-) create mode 100644 lustre/ldlm/ldlm_reclaim.c diff --git a/libcfs/include/libcfs/libcfs_hash.h b/libcfs/include/libcfs/libcfs_hash.h index 1d07b69..810c785 100644 --- a/libcfs/include/libcfs/libcfs_hash.h +++ b/libcfs/include/libcfs/libcfs_hash.h @@ 
-710,7 +710,7 @@ void cfs_hash_for_each_safe(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); int cfs_hash_for_each_nolock(struct cfs_hash *hs, cfs_hash_for_each_cb_t, - void *data); + void *data, int start); int cfs_hash_for_each_empty(struct cfs_hash *hs, cfs_hash_for_each_cb_t, void *data); diff --git a/libcfs/libcfs/hash.c b/libcfs/libcfs/hash.c index 504ed55..44bfaf4 100644 --- a/libcfs/libcfs/hash.c +++ b/libcfs/libcfs/hash.c @@ -1586,7 +1586,7 @@ EXPORT_SYMBOL(cfs_hash_size_get); */ static int cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, - void *data) + void *data, int start) { struct hlist_node *hnode; struct hlist_node *tmp; @@ -1594,19 +1594,25 @@ cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, __u32 version; int count = 0; int stop_on_change; - int rc; - int i; + int rc = 0; + int i, end = -1; ENTRY; stop_on_change = cfs_hash_with_rehash_key(hs) || !cfs_hash_with_no_itemref(hs) || hs->hs_ops->hs_put_locked == NULL; cfs_hash_lock(hs, 0); +again: LASSERT(!cfs_hash_is_rehashing(hs)); cfs_hash_for_each_bucket(hs, &bd, i) { struct hlist_head *hhead; + if (i < start) + continue; + else if (end > 0 && i >= end) + break; + cfs_hash_bd_lock(hs, &bd, 0); version = cfs_hash_bd_version_get(&bd); @@ -1646,14 +1652,20 @@ cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, if (rc) /* callback wants to break iteration */ break; } - cfs_hash_unlock(hs, 0); - return count; + if (start > 0 && rc != 0) { + end = start; + start = 0; + goto again; + } + + cfs_hash_unlock(hs, 0); + return count; } int cfs_hash_for_each_nolock(struct cfs_hash *hs, - cfs_hash_for_each_cb_t func, void *data) + cfs_hash_for_each_cb_t func, void *data, int start) { ENTRY; @@ -1667,11 +1679,11 @@ cfs_hash_for_each_nolock(struct cfs_hash *hs, hs->hs_ops->hs_put_locked == NULL)) RETURN(-EOPNOTSUPP); - cfs_hash_for_each_enter(hs); - cfs_hash_for_each_relax(hs, func, data); - cfs_hash_for_each_exit(hs); + 
cfs_hash_for_each_enter(hs); + cfs_hash_for_each_relax(hs, func, data, start); + cfs_hash_for_each_exit(hs); - RETURN(0); + RETURN(0); } EXPORT_SYMBOL(cfs_hash_for_each_nolock); @@ -1701,13 +1713,13 @@ cfs_hash_for_each_empty(struct cfs_hash *hs, hs->hs_ops->hs_put_locked == NULL)) return -EOPNOTSUPP; - cfs_hash_for_each_enter(hs); - while (cfs_hash_for_each_relax(hs, func, data)) { - CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", - hs->hs_name, i++); - } - cfs_hash_for_each_exit(hs); - RETURN(0); + cfs_hash_for_each_enter(hs); + while (cfs_hash_for_each_relax(hs, func, data, 0)) { + CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", + hs->hs_name, i++); + } + cfs_hash_for_each_exit(hs); + RETURN(0); } EXPORT_SYMBOL(cfs_hash_for_each_empty); diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 4839506..57c3c15 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -322,6 +322,10 @@ struct ldlm_ns_bucket { * fact the network or overall system load is at fault */ struct adaptive_timeout nsb_at_estimate; + /** + * Which res in the bucket should we start with the reclaim. + */ + int nsb_reclaim_start; }; enum { @@ -507,6 +511,11 @@ struct ldlm_namespace { * recalculation of LDLM pool statistics should be skipped. */ unsigned ns_stopping:1; + + /** + * Which bucket should we start with the lock reclaim. 
+ */ + int ns_reclaim_start; }; /** diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 272b87b..e0f3be1 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -352,10 +352,11 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LDLM_CP_CB_WAIT3 0x321 #define OBD_FAIL_LDLM_CP_CB_WAIT4 0x322 #define OBD_FAIL_LDLM_CP_CB_WAIT5 0x323 - #define OBD_FAIL_LDLM_SRV_BL_AST 0x324 #define OBD_FAIL_LDLM_SRV_CP_AST 0x325 #define OBD_FAIL_LDLM_SRV_GL_AST 0x326 +#define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 +#define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 /* LOCKLESS IO */ #define OBD_FAIL_LDLM_SET_CONTENTION 0x385 diff --git a/lustre/ldlm/Makefile.am b/lustre/ldlm/Makefile.am index bc996a7..af85299 100644 --- a/lustre/ldlm/Makefile.am +++ b/lustre/ldlm/Makefile.am @@ -42,4 +42,4 @@ MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ EXTRA_DIST = ldlm_extent.c ldlm_flock.c ldlm_internal.h ldlm_lib.c \ ldlm_lock.c ldlm_lockd.c ldlm_plain.c ldlm_request.c \ ldlm_resource.c l_lock.c ldlm_inodebits.c ldlm_pool.c \ - interval_tree.c + interval_tree.c ldlm_reclaim.c diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h index cdfbe9c..806f33b 100644 --- a/lustre/ldlm/ldlm_internal.h +++ b/lustre/ldlm/ldlm_internal.h @@ -343,3 +343,12 @@ void ldlm_flock_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, ldlm_wire_policy_data_t *wpolicy); + +/* ldlm_reclaim.c */ +extern __u64 ldlm_watermark_low; +extern __u64 ldlm_watermark_high; +int ldlm_reclaim_setup(void); +void ldlm_reclaim_cleanup(void); +void ldlm_reclaim_add(struct ldlm_lock *lock); +void ldlm_reclaim_del(struct ldlm_lock *lock); +bool ldlm_reclaim_full(void); diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index b3e914c..a324b93 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -2105,13 +2105,13 @@ static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd, 
*/ void ldlm_reprocess_all_ns(struct ldlm_namespace *ns) { - ENTRY; + ENTRY; - if (ns != NULL) { - cfs_hash_for_each_nolock(ns->ns_rs_hash, - ldlm_reprocess_res, NULL); - } - EXIT; + if (ns != NULL) { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_reprocess_res, NULL, 0); + } + EXIT; } /** diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index cf55d08..5f45863 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1265,7 +1265,14 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, flags |= LDLM_FL_RESENT; GOTO(existing_lock, rc = 0); } - } + } else { + if (ldlm_reclaim_full()) { + DEBUG_REQ(D_DLMTRACE, req, "Too many granted locks, " + "reject current enqueue request and let the " + "client retry later.\n"); + GOTO(out, rc = -EINPROGRESS); + } + } /* The lock's callback data might be set in the policy function */ lock = ldlm_lock_create(ns, &dlm_req->lock_desc.l_resource.lr_name, @@ -2942,6 +2949,12 @@ static int ldlm_setup(void) CERROR("Failed to initialize LDLM pools: %d\n", rc); GOTO(out, rc); } + + rc = ldlm_reclaim_setup(); + if (rc) { + CERROR("Failed to setup reclaim thread: rc = %d\n", rc); + GOTO(out, rc); + } RETURN(0); out: @@ -2961,7 +2974,8 @@ static int ldlm_cleanup(void) RETURN(-EBUSY); } - ldlm_pools_fini(); + ldlm_reclaim_cleanup(); + ldlm_pools_fini(); if (ldlm_state->ldlm_bl_pool != NULL) { struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; diff --git a/lustre/ldlm/ldlm_pool.c b/lustre/ldlm/ldlm_pool.c index ea1971f..f75af5c 100644 --- a/lustre/ldlm/ldlm_pool.c +++ b/lustre/ldlm/ldlm_pool.c @@ -912,10 +912,16 @@ void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) * cancelled, instead special kind of lock is used to drop them. * also there is no LRU for flock locks, so no point in tracking * them anyway. + * + * PLAIN locks are used by config and quota, the quantity is small + * and usually they are not in LRU. 
*/ - if (lock->l_resource->lr_type == LDLM_FLOCK) + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) return; + ldlm_reclaim_add(lock); + atomic_inc(&pl->pl_granted); atomic_inc(&pl->pl_grant_rate); lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); @@ -935,11 +941,15 @@ void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) { /* - * Filter out FLOCK locks. Read above comment in ldlm_pool_add(). + * Filter out FLOCK & PLAIN locks. Read above comment in + * ldlm_pool_add(). */ - if (lock->l_resource->lr_type == LDLM_FLOCK) + if (lock->l_resource->lr_type == LDLM_FLOCK || + lock->l_resource->lr_type == LDLM_PLAIN) return; + ldlm_reclaim_del(lock); + LASSERT(atomic_read(&pl->pl_granted) > 0); atomic_dec(&pl->pl_granted); atomic_inc(&pl->pl_cancel_rate); diff --git a/lustre/ldlm/ldlm_reclaim.c b/lustre/ldlm/ldlm_reclaim.c new file mode 100644 index 0000000..01ec441 --- /dev/null +++ b/lustre/ldlm/ldlm_reclaim.c @@ -0,0 +1,368 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2015, Intel Corporation. + * Use is subject to license terms. 
+ * + * Author: Niu Yawei + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include "ldlm_internal.h" + +/* + * To avoid ldlm lock exhausting server memory, two global parameters: + * ldlm_watermark_low & ldlm_watermark_high are used for reclaiming + * granted locks and rejecting incoming enqueue requests defensively. + * + * ldlm_watermark_low: When the amount of granted locks reaching this + * threshold, server start to revoke locks gradually. + * + * ldlm_watermark_high: When the amount of granted locks reaching this + * threshold, server will return -EINPROGRESS to any incoming enqueue + * request until the lock count is shrunk below the threshold again. + * + * ldlm_watermark_low & ldlm_watermark_high is set to 20% & 30% of the + * total memory by default. It is tunable via proc entry, when it's set + * to 0, the feature is disabled. + */ + +/* + * FIXME: + * + * In current implementation, server identifies which locks should be + * revoked by choosing locks from namespace/resource in a roundrobin + * manner, which isn't optimal. The ideal way should be server notifies + * clients to cancel locks voluntarily, because only client knows exactly + * when the lock is last used. + * + * However how to notify client immediately is a problem, one idea + * is to leverage the glimplse callbacks on some artificial global + * lock (like quota global lock does), but that requires protocol + * changes, let's fix it in future long-term solution. 
+ */ + +__u64 ldlm_watermark_low; +__u64 ldlm_watermark_high; + +#ifdef HAVE_SERVER_SUPPORT + +static struct percpu_counter ldlm_granted_total; +static atomic_t ldlm_nr_reclaimer; +static cfs_duration_t ldlm_last_reclaim_age; +static cfs_time_t ldlm_last_reclaim_time; + +struct ldlm_reclaim_cb_data { + struct list_head rcd_rpc_list; + int rcd_added; + int rcd_total; + int rcd_cursor; + int rcd_start; + bool rcd_skip; + cfs_duration_t rcd_age; + struct cfs_hash_bd *rcd_prev_bd; +}; + +static inline bool ldlm_lock_reclaimable(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + /* FLOCK & PLAIN lock are not reclaimable. FLOCK is + * explicitly controlled by application, PLAIN lock + * is used by quota global lock and config lock. + */ + if (ns->ns_client == LDLM_NAMESPACE_SERVER && + (lock->l_resource->lr_type == LDLM_IBITS || + lock->l_resource->lr_type == LDLM_EXTENT)) + return true; + return false; +} + +static int ldlm_reclaim_lock_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res; + struct ldlm_reclaim_cb_data *data; + struct ldlm_lock *lock; + struct ldlm_ns_bucket *nsb; + int rc = 0; + + data = (struct ldlm_reclaim_cb_data *)arg; + + LASSERTF(data->rcd_added < data->rcd_total, "added:%d >= total:%d\n", + data->rcd_added, data->rcd_total); + + nsb = cfs_hash_bd_extra_get(hs, bd); + res = cfs_hash_object(hs, hnode); + + if (data->rcd_prev_bd != bd) { + if (data->rcd_prev_bd != NULL) + ldlm_res_to_ns(res)->ns_reclaim_start++; + data->rcd_prev_bd = bd; + data->rcd_cursor = 0; + data->rcd_start = nsb->nsb_reclaim_start % + cfs_hash_bd_count_get(bd); + } + + if (data->rcd_skip && data->rcd_cursor < data->rcd_start) { + data->rcd_cursor++; + return 0; + } + + nsb->nsb_reclaim_start++; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (!ldlm_lock_reclaimable(lock)) + continue; + + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW) && + 
cfs_time_before(cfs_time_current(), + cfs_time_add(lock->l_last_used, + data->rcd_age))) + continue; + + if (!ldlm_is_ast_sent(lock)) { + ldlm_set_ast_sent(lock); + LASSERT(list_empty(&lock->l_rk_ast)); + list_add(&lock->l_rk_ast, &data->rcd_rpc_list); + LDLM_LOCK_GET(lock); + if (++data->rcd_added == data->rcd_total) { + rc = 1; /* stop the iteration */ + break; + } + } + } + unlock_res(res); + + return rc; +} + +static void ldlm_reclaim_res(struct ldlm_namespace *ns, int *count, + cfs_duration_t age, bool skip) +{ + struct ldlm_reclaim_cb_data data; + int idx, type, start; + ENTRY; + + LASSERT(*count != 0); + + if (ns->ns_obd) { + type = server_name2index(ns->ns_obd->obd_name, &idx, NULL); + if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) { + EXIT; + return; + } + } + + if (atomic_read(&ns->ns_bref) == 0) { + EXIT; + return; + } + + INIT_LIST_HEAD(&data.rcd_rpc_list); + data.rcd_added = 0; + data.rcd_total = *count; + data.rcd_age = age; + data.rcd_skip = skip; + data.rcd_prev_bd = NULL; + start = ns->ns_reclaim_start % CFS_HASH_NBKT(ns->ns_rs_hash); + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_reclaim_lock_cb, &data, + start); + + CDEBUG(D_DLMTRACE, "NS(%s): %d locks to be reclaimed, found %d/%d " + "locks.\n", ldlm_ns_name(ns), *count, data.rcd_added, + data.rcd_total); + + LASSERTF(*count >= data.rcd_added, "count:%d, added:%d\n", *count, + data.rcd_added); + + ldlm_run_ast_work(ns, &data.rcd_rpc_list, LDLM_WORK_REVOKE_AST); + *count -= data.rcd_added; + EXIT; +} + +#define LDLM_RECLAIM_BATCH 512 +#define LDLM_RECLAIM_AGE_MIN cfs_time_seconds(300) +#define LDLM_RECLAIM_AGE_MAX (LDLM_DEFAULT_MAX_ALIVE * 3 / 4) + +static inline cfs_duration_t ldlm_reclaim_age(void) +{ + cfs_duration_t age; + + age = ldlm_last_reclaim_age + + cfs_time_sub(cfs_time_current(), ldlm_last_reclaim_time); + if (age > LDLM_RECLAIM_AGE_MAX) + age = LDLM_RECLAIM_AGE_MAX; + else if (age < (LDLM_RECLAIM_AGE_MIN * 2)) + age = LDLM_RECLAIM_AGE_MIN; + return age; +} + +static 
void ldlm_reclaim_ns(void) +{ + struct ldlm_namespace *ns; + int count = LDLM_RECLAIM_BATCH; + int ns_nr, nr_processed; + ldlm_side_t ns_cli = LDLM_NAMESPACE_SERVER; + cfs_duration_t age; + bool skip = true; + ENTRY; + + if (!atomic_add_unless(&ldlm_nr_reclaimer, 1, 1)) { + EXIT; + return; + } + + age = ldlm_reclaim_age(); +again: + nr_processed = 0; + ns_nr = ldlm_namespace_nr_read(ns_cli); + while (count > 0 && nr_processed < ns_nr) { + mutex_lock(ldlm_namespace_lock(ns_cli)); + + if (list_empty(ldlm_namespace_list(ns_cli))) { + mutex_unlock(ldlm_namespace_lock(ns_cli)); + goto out; + } + + ns = ldlm_namespace_first_locked(ns_cli); + ldlm_namespace_move_to_active_locked(ns, ns_cli); + mutex_unlock(ldlm_namespace_lock(ns_cli)); + + ldlm_reclaim_res(ns, &count, age, skip); + ldlm_namespace_put(ns); + nr_processed++; + } + + if (count > 0 && age > LDLM_RECLAIM_AGE_MIN) { + age >>= 1; + if (age < (LDLM_RECLAIM_AGE_MIN * 2)) + age = LDLM_RECLAIM_AGE_MIN; + skip = false; + goto again; + } + + ldlm_last_reclaim_age = age; + ldlm_last_reclaim_time = cfs_time_current(); +out: + atomic_add_unless(&ldlm_nr_reclaimer, -1, 0); + EXIT; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_add(&ldlm_granted_total, 1); + lock->l_last_used = cfs_time_current(); +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ + if (!ldlm_lock_reclaimable(lock)) + return; + percpu_counter_sub(&ldlm_granted_total, 1); +} + +bool ldlm_reclaim_full(void) +{ + __u64 high = ldlm_watermark_high; + __u64 low = ldlm_watermark_low; + + if (low != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW)) + low = cfs_fail_val; + + if (low != 0 && + percpu_counter_read_positive(&ldlm_granted_total) > low) + ldlm_reclaim_ns(); + + if (high != 0 && OBD_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH)) + high = cfs_fail_val; + + if (high != 0 && + percpu_counter_read_positive(&ldlm_granted_total) > high) + return true; + + return false; +} + +static inline 
__u64 ldlm_ratio2locknr(int ratio) +{ + __u64 locknr; + + locknr = ((__u64)NUM_CACHEPAGES << PAGE_CACHE_SHIFT) * ratio; + do_div(locknr, 100 * sizeof(struct ldlm_lock)); + + return locknr; +} + +#define LDLM_WM_RATIO_LOW_DEFAULT 20 +#define LDLM_WM_RATIO_HIGH_DEFAULT 30 + +int ldlm_reclaim_setup(void) +{ + atomic_set(&ldlm_nr_reclaimer, 0); + ldlm_watermark_low = ldlm_ratio2locknr(LDLM_WM_RATIO_LOW_DEFAULT); + ldlm_watermark_high = ldlm_ratio2locknr(LDLM_WM_RATIO_HIGH_DEFAULT); + ldlm_last_reclaim_age = LDLM_RECLAIM_AGE_MAX; + ldlm_last_reclaim_time = cfs_time_current(); + + return percpu_counter_init(&ldlm_granted_total, 0); +} + +void ldlm_reclaim_cleanup(void) +{ + percpu_counter_destroy(&ldlm_granted_total); +} + +#else /* HAVE_SERVER_SUPPORT */ + +bool ldlm_reclaim_full(void) +{ + return false; +} + +void ldlm_reclaim_add(struct ldlm_lock *lock) +{ +} + +void ldlm_reclaim_del(struct ldlm_lock *lock) +{ +} + +int ldlm_reclaim_setup(void) +{ + return 0; +} + +void ldlm_reclaim_cleanup(void) +{ +} + +#endif /* HAVE_SERVER_SUPPORT */ diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index d3049c0..7b90e49 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -2014,11 +2014,11 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL, LCK_MINMODE, flags, opaque)); - } else { - cfs_hash_for_each_nolock(ns->ns_rs_hash, - ldlm_cli_hash_cancel_unused, &arg); - RETURN(ELDLM_OK); - } + } else { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_cli_hash_cancel_unused, &arg, 0); + RETURN(ELDLM_OK); + } } /* Lock iterators. 
*/ @@ -2088,8 +2088,8 @@ void ldlm_namespace_foreach(struct ldlm_namespace *ns, { struct iter_helper_data helper = { .iter = iter, .closure = closure }; - cfs_hash_for_each_nolock(ns->ns_rs_hash, - ldlm_res_iter_helper, &helper); + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_iter_helper, &helper, 0); } diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index fd7b65d..084ad80 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -84,6 +84,55 @@ LPROC_SEQ_FOPS_WO_TYPE(ldlm, dump_ns); LPROC_SEQ_FOPS_RW_TYPE(ldlm_rw, uint); LPROC_SEQ_FOPS_RO_TYPE(ldlm, uint); +/* Lock count is stored in the watermark, and it's display as number of MB + * memory consumed by the locks */ +static int seq_watermark_show(struct seq_file *m, void *data) +{ + __u64 locknr = *(__u64 *)m->private; + return seq_printf(m, LPU64"\n", + (locknr * sizeof(struct ldlm_lock)) >> 20); +} + +static ssize_t seq_watermark_write(struct file *file, + const char __user *buffer, size_t count, + loff_t *off) +{ + __u64 watermark; + __u64 *data = ((struct seq_file *)file->private_data)->private; + int rc; + + rc = lprocfs_write_frac_u64_helper(buffer, count, &watermark, 1 << 20); + if (rc) { + CERROR("Failed to set LDLM watermark, rc = %d.\n", rc); + return rc; + } else if (watermark != 0 && watermark < (1 << 20)) { + CERROR("Watermark should be greater than 1MB.\n"); + return -EINVAL; + } + + do_div(watermark, sizeof(struct ldlm_lock)); + *data = watermark; + + if (ldlm_watermark_low != 0 && ldlm_watermark_high != 0 && + ldlm_watermark_low > ldlm_watermark_high) + ldlm_watermark_low = ldlm_watermark_high; + return count; +} + +static int seq_watermark_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_watermark_show, PDE_DATA(inode)); +} + +static const struct file_operations ldlm_watermark_fops = { + .owner = THIS_MODULE, + .open = seq_watermark_open, + .read = seq_read, + .write = seq_watermark_write, + .llseek = seq_lseek, + 
.release = lprocfs_single_release, +}; + int ldlm_proc_setup(void) { int rc; @@ -97,6 +146,12 @@ int ldlm_proc_setup(void) { .name = "cancel_unused_locks_before_replay", .fops = &ldlm_rw_uint_fops, .data = &ldlm_cancel_unused_locks_before_replay }, + { .name = "watermark_mb_low", + .fops = &ldlm_watermark_fops, + .data = &ldlm_watermark_low }, + { .name = "watermark_mb_high", + .fops = &ldlm_watermark_fops, + .data = &ldlm_watermark_high }, { NULL }}; ENTRY; LASSERT(ldlm_ns_proc_dir == NULL); @@ -613,6 +668,7 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0); nsb->nsb_namespace = ns; + nsb->nsb_reclaim_start = 0; } ns->ns_obd = obd; @@ -638,6 +694,7 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, ns->ns_orig_connect_flags = 0; ns->ns_connect_flags = 0; ns->ns_stopping = 0; + ns->ns_reclaim_start = 0; rc = ldlm_namespace_proc_register(ns); if (rc != 0) { CERROR("Can't initialize ns proc, rc %d\n", rc); @@ -797,9 +854,11 @@ int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags) return ELDLM_OK; } - cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, &flags); - cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, NULL); - return ELDLM_OK; + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, + &flags, 0); + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, + NULL, 0); + return ELDLM_OK; } EXPORT_SYMBOL(ldlm_namespace_cleanup); @@ -1365,7 +1424,7 @@ void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_res_hash_dump, - (void *)(unsigned long)level); + (void *)(unsigned long)level, 0); spin_lock(&ns->ns_lock); ns->ns_next_dump = cfs_time_shift(10); spin_unlock(&ns->ns_lock); diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 9a17940..9ebef67 100644 --- a/lustre/mdc/mdc_locks.c 
+++ b/lustre/mdc/mdc_locks.c @@ -817,11 +817,6 @@ resend: if (IS_ERR(req)) RETURN(PTR_ERR(req)); - if (req != NULL && it && it->it_op & IT_CREAT) - /* ask ptlrpc not to resend on EINPROGRESS since we have our own - * retry logic */ - req->rq_no_retry_einprogress = 1; - if (resends) { req->rq_generation_set = 1; req->rq_import_generation = generation; @@ -879,10 +874,10 @@ resend: lockrep->lock_policy_res2 = ptlrpc_status_ntoh(lockrep->lock_policy_res2); - /* Retry the create infinitely when we get -EINPROGRESS from - * server. This is required by the new quota design. */ - if (it && it->it_op & IT_CREAT && - (int)lockrep->lock_policy_res2 == -EINPROGRESS) { + /* Retry infinitely when the server returns -EINPROGRESS for the + * intent operation, when server returns -EINPROGRESS for acquiring + * intent lock, we'll retry in after_reply(). */ + if (it && (int)lockrep->lock_policy_res2 == -EINPROGRESS) { mdc_clear_replay_flag(req, rc); ptlrpc_req_finished(req); resends++; diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index c4b3c9b..631987e 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -2269,6 +2269,7 @@ static int ofd_prolong_extent_locks(struct tgt_session_info *tsi, LDLM_LOCK_PUT(lock); RETURN(lock_count); } + lock->l_last_used = cfs_time_current(); LDLM_LOCK_PUT(lock); } } diff --git a/lustre/ptlrpc/Makefile.in b/lustre/ptlrpc/Makefile.in index 37c86cf..7521a98 100644 --- a/lustre/ptlrpc/Makefile.in +++ b/lustre/ptlrpc/Makefile.in @@ -8,6 +8,7 @@ ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o ldlm_objs += $(LDLM)ldlm_pool.o $(LDLM)interval_tree.o +ldlm_objs += $(LDLM)ldlm_reclaim.o target_objs := $(TARGET)tgt_main.o $(TARGET)tgt_lastrcvd.o target_objs += $(TARGET)tgt_handler.o $(TARGET)out_handler.o diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index bbf65c3..42562f5 100644 --- a/lustre/tests/sanity.sh +++ 
b/lustre/tests/sanity.sh @@ -9374,6 +9374,79 @@ test_133g() { } run_test 133g "Check for Oopses on bad io area writes/reads in /proc" +test_134a() { + [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.54) ]] && + skip "Need MDS version at least 2.7.54" && return + + mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" + cancel_lru_locks mdc + + local nsdir="ldlm.namespaces.*-MDT0000-mdc-*" + local unused=$($LCTL get_param -n $nsdir.lock_unused_count) + [ $unused -eq 0 ] || error "$unused locks are not cleared" + + local nr=1000 + createmany -o $DIR/$tdir/f $nr || + error "failed to create $nr files in $DIR/$tdir" + unused=$($LCTL get_param -n $nsdir.lock_unused_count) + + #define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 + do_facet mds1 $LCTL set_param fail_loc=0x327 + do_facet mds1 $LCTL set_param fail_val=500 + touch $DIR/$tdir/m + + echo "sleep 10 seconds ..." + sleep 10 + local lck_cnt=$($LCTL get_param -n $nsdir.lock_unused_count) + + do_facet mds1 $LCTL set_param fail_loc=0 + do_facet mds1 $LCTL set_param fail_val=0 + [ $lck_cnt -lt $unused ] || + error "No locks reclaimed, before:$unused, after:$lck_cnt" + + rm $DIR/$tdir/m + unlinkmany $DIR/$tdir/f $nr +} +run_test 134a "Server reclaims locks when reaching low watermark" + +test_134b() { + [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.54) ]] && + skip "Need MDS version at least 2.7.54" && return + + mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" + cancel_lru_locks mdc + + local low_wm=$(do_facet mds1 $LCTL get_param -n ldlm.watermark_mb_low) + # disable reclaim temporarily + do_facet mds1 $LCTL set_param ldlm.watermark_mb_low=0 + + #define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 + do_facet mds1 $LCTL set_param fail_loc=0x328 + do_facet mds1 $LCTL set_param fail_val=500 + + $LCTL set_param debug=+trace + + local nr=600 + createmany -o $DIR/$tdir/f $nr & + local create_pid=$! + + echo "Sleep $TIMEOUT seconds ..." + sleep $TIMEOUT + if ! 
ps -p $create_pid > /dev/null 2>&1; then + do_facet mds1 $LCTL set_param fail_loc=0 + do_facet mds1 $LCTL set_param fail_val=0 + do_facet mds1 $LCTL set_param ldlm.watermark_mb_low=$low_wm + error "createmany finished incorrectly!" + fi + do_facet mds1 $LCTL set_param fail_loc=0 + do_facet mds1 $LCTL set_param fail_val=0 + do_facet mds1 $LCTL set_param ldlm.watermark_mb_low=$low_wm + wait $create_pid || return 1 + + unlinkmany $DIR/$tdir/f $nr +} +run_test 134b "Server rejects lock request when reaching high watermark" + test_140() { #bug-17379 [ $PARALLEL == "yes" ] && skip "skip parallel run" && return test_mkdir -p $DIR/$tdir || error "Creating dir $DIR/$tdir" -- 1.8.3.1