From d5c583291498d26f5f5634b8f3463bbfe7109f1e Mon Sep 17 00:00:00 2001
From: Patrick Farrell <paf@cray.com>
Date: Mon, 14 Aug 2017 05:09:35 -0500
Subject: [PATCH] LU-8276 ldlm: Make lru clear always discard read lock pages

A significant amount of time is sometimes spent during
lru clearing (i.e., echo 'clear' > lru_size) checking
pages to see if they are covered by another read lock.
Since all unused read locks will be destroyed by this
operation, the pages will be freed momentarily anyway,
so performing that check is a waste of time.

This patch sets the LDLM_FL_DISCARD_DATA flag on all the PR
locks which are slated for cancellation by
ldlm_prepare_lru_list when it is called from
ldlm_ns_drop_cache.

The case where another lock covers those pages (and is in
use and so does not get cancelled by lru clear) is safe for
a few reasons:

1. When discarding pages, we wait (discard_cb->cl_page_own)
until they are in the cached state before invalidating.
So if they are actively in use, we'll wait until that use
is done.

2. Removal of pages under a read lock is something that can
happen due to memory pressure, since these are VFS cache
pages. If a client reads something which is then removed
from the cache and goes to read it again, this will simply
generate a new read request.

This has a performance cost for that reader, but if anyone
is clearing the ldlm lru while actively doing I/O in that
namespace, then they cannot expect good performance.

In the case of many read locks on a single resource, this
improves cleanup time dramatically.  In internal testing at
Cray with ~80,000 read locks on a single file, this improves
cleanup time from ~60 seconds to ~0.5 seconds.  This also
slightly improves cleanup speed in the case of 1 or a few
read locks on a file.

Lustre-change: https://review.whamcloud.com/20785
Lustre-commit: 6a369b59f3729513dd8e81c4964dc6183287b601

Signed-off-by: Patrick Farrell <paf@cray.com>
Change-Id: I0c076b31ea474bb5f012373ed2033de3e447b62d
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
Signed-off-by: Minh Diep <minh.diep@intel.com>
Reviewed-on: https://review.whamcloud.com/29264
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: John L. Hammond <john.hammond@intel.com>
---
 lustre/include/lustre_dlm_flags.h | 2 +-
 lustre/ldlm/ldlm_internal.h       | 5 +++++
 lustre/ldlm/ldlm_request.c        | 9 +++++++++
 lustre/ldlm/ldlm_resource.c       | 6 ++++--
 lustre/osc/osc_cache.c            | 4 ++--
 lustre/osc/osc_cl_internal.h      | 2 +-
 lustre/osc/osc_lock.c             | 8 ++++----
 lustre/osc/osc_object.c           | 2 +-
 8 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/lustre/include/lustre_dlm_flags.h b/lustre/include/lustre_dlm_flags.h
index 9b7037e..179cb71 100644
--- a/lustre/include/lustre_dlm_flags.h
+++ b/lustre/include/lustre_dlm_flags.h
@@ -90,7 +90,7 @@
 #define ldlm_set_flock_deadlock(_l)     LDLM_SET_FLAG((  _l), 1ULL << 15)
 #define ldlm_clear_flock_deadlock(_l)   LDLM_CLEAR_FLAG((_l), 1ULL << 15)
 
-/** discard (no writeback) on cancel */
+/** discard (no writeback (PW locks) or page retention (PR locks)) on cancel */
 #define LDLM_FL_DISCARD_DATA            0x0000000000010000ULL // bit  16
 #define ldlm_is_discard_data(_l)        LDLM_TEST_FLAG(( _l), 1ULL << 16)
 #define ldlm_set_discard_data(_l)       LDLM_SET_FLAG((  _l), 1ULL << 16)
diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h
index b7764dc..8ef4709 100644
--- a/lustre/ldlm/ldlm_internal.h
+++ b/lustre/ldlm/ldlm_internal.h
@@ -103,6 +103,11 @@ enum ldlm_lru_flags {
 	LDLM_LRU_FLAG_LRUR	= 0x08, /* Cancel locks from lru resize */
 	LDLM_LRU_FLAG_NO_WAIT	= 0x10, /* Cancel locks w/o blocking (neither
 					 * sending nor waiting for any RPCs) */
+	LDLM_LRU_FLAG_CLEANUP	= 0x20, /* Used when clearing lru, tells
+					 * prepare_lru_list to set discard flag
+					 * on PR extent locks so we don't waste
+					 * time saving pages that will be
+					 * discarded momentarily */
 };
 
 int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c
index 8432ade..0d8eeae 100644
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -1693,6 +1693,10 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags)
  *				(typically before replaying locks) w/o
  *				sending any RPCs or waiting for any
  *				outstanding RPC to complete.
+ *
+ * flags & LDLM_LRU_FLAG_CLEANUP - when cancelling read locks, do not check for
+ * 				other read locks covering the same pages, just
+ * 				discard those pages.
  */
 static int ldlm_prepare_lru_list(struct ldlm_namespace *ns,
 				 struct list_head *cancels, int count, int max,
@@ -1813,6 +1817,11 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns,
 		 * this flag and call l_blocking_ast */
 		lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING;
 
+		if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) &&
+		    lock->l_resource->lr_type == LDLM_EXTENT &&
+		    lock->l_granted_mode == LCK_PR)
+			ldlm_set_discard_data(lock);
+
 		/* We can't re-add to l_lru as it confuses the
 		 * refcounting in ldlm_lock_remove_from_lru() if an AST
 		 * arrives after we drop lr_lock below. We use l_bl_ast
diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c
index 1257d85..bf03a3a 100644
--- a/lustre/ldlm/ldlm_resource.c
+++ b/lustre/ldlm/ldlm_resource.c
@@ -331,7 +331,8 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr,
 
 			/* Try to cancel all @ns_nr_unused locks. */
 			canceled = ldlm_cancel_lru(ns, unused, 0,
-						   LDLM_LRU_FLAG_PASSED);
+						   LDLM_LRU_FLAG_PASSED |
+						   LDLM_LRU_FLAG_CLEANUP);
 			if (canceled < unused) {
 				CDEBUG(D_DLMTRACE,
 				       "not all requested locks are canceled, requested: %d, canceled: %d\n",
@@ -342,7 +343,8 @@ static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr,
 		} else {
 			tmp = ns->ns_max_unused;
 			ns->ns_max_unused = 0;
-			ldlm_cancel_lru(ns, 0, 0, LDLM_LRU_FLAG_PASSED);
+			ldlm_cancel_lru(ns, 0, 0, LDLM_LRU_FLAG_PASSED |
+					LDLM_LRU_FLAG_CLEANUP);
 			ns->ns_max_unused = tmp;
 		}
 		return count;
diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c
index 3e9e29e..06da0c5 100644
--- a/lustre/osc/osc_cache.c
+++ b/lustre/osc/osc_cache.c
@@ -3287,7 +3287,7 @@ static int discard_cb(const struct lu_env *env, struct cl_io *io,
  * behind this being that lock cancellation cannot be delayed indefinitely).
  */
 int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc,
-			   pgoff_t start, pgoff_t end, enum cl_lock_mode mode)
+			   pgoff_t start, pgoff_t end, bool discard)
 {
 	struct osc_thread_info *info = osc_env_info(env);
 	struct cl_io *io = &info->oti_io;
@@ -3303,7 +3303,7 @@ int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc,
 	if (result != 0)
 		GOTO(out, result);
 
-	cb = mode == CLM_READ ? check_and_discard_cb : discard_cb;
+	cb = discard ? discard_cb : check_and_discard_cb;
 	info->oti_fn_index = info->oti_next_index = start;
 	do {
 		res = osc_page_gang_lookup(env, io, osc,
diff --git a/lustre/osc/osc_cl_internal.h b/lustre/osc/osc_cl_internal.h
index f9b9faf..99a5098 100644
--- a/lustre/osc/osc_cl_internal.h
+++ b/lustre/osc/osc_cl_internal.h
@@ -663,7 +663,7 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
 int osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
 
 int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc,
-			   pgoff_t start, pgoff_t end, enum cl_lock_mode mode);
+			   pgoff_t start, pgoff_t end, bool discard_pages);
 
 typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *,
 				 struct osc_page *, void *);
diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c
index fda349b..6d53b5b 100644
--- a/lustre/osc/osc_lock.c
+++ b/lustre/osc/osc_lock.c
@@ -386,7 +386,7 @@ out:
 }
 
 static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end,
-			  enum cl_lock_mode mode, int discard)
+			  enum cl_lock_mode mode, bool discard)
 {
 	struct lu_env		*env;
 	__u16			refcheck;
@@ -409,7 +409,7 @@ static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end,
 			rc = 0;
 	}
 
-	rc2 = osc_lock_discard_pages(env, obj, start, end, mode);
+	rc2 = osc_lock_discard_pages(env, obj, start, end, discard);
 	if (rc == 0 && rc2 < 0)
 		rc = rc2;
 
@@ -427,7 +427,7 @@ static int osc_dlm_blocking_ast0(const struct lu_env *env,
 {
 	struct cl_object	*obj = NULL;
 	int			result = 0;
-	int			discard;
+	bool			discard;
 	enum cl_lock_mode	mode = CLM_READ;
 	ENTRY;
 
@@ -1114,7 +1114,7 @@ static void osc_lock_lockless_cancel(const struct lu_env *env,
 
 	LASSERT(ols->ols_dlmlock == NULL);
 	result = osc_lock_flush(osc, descr->cld_start, descr->cld_end,
-				descr->cld_mode, 0);
+				descr->cld_mode, false);
         if (result)
                 CERROR("Pages for lockless lock %p were not purged(%d)\n",
                        ols, result);
diff --git a/lustre/osc/osc_object.c b/lustre/osc/osc_object.c
index 7c486dd..052f8bc 100644
--- a/lustre/osc/osc_object.c
+++ b/lustre/osc/osc_object.c
@@ -471,7 +471,7 @@ int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc)
 	osc_cache_truncate_start(env, osc, 0, NULL);
 
 	/* Discard all caching pages */
-	osc_lock_discard_pages(env, osc, 0, CL_PAGE_EOF, CLM_WRITE);
+	osc_lock_discard_pages(env, osc, 0, CL_PAGE_EOF, true);
 
 	/* Clear ast data of dlm lock. Do this after discarding all pages */
 	osc_object_prune(env, osc2cl(osc));
-- 
1.8.3.1