From: Patrick Farrell <pfarrell@whamcloud.com>
Date: Mon, 9 Sep 2019 15:56:07 +0000 (-0400)
Subject: LU-11670 osc: glimpse - search for active lock
X-Git-Tag: 2.12.90~128
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=b3461d11dcb04670cc2e1bfbb99306cfd3f645ef

LU-11670 osc: glimpse - search for active lock

When there are lock-ahead write locks on a file, the server
sends one glimpse AST RPC to each client having such (it
may have many) locks. This callback is sent to the lock
having the highest offset.

Client's glimpse callback goes up to the clio layers and
gets the global (not lock-specific) view of size.  The clio
layers are connected to the extent lock through the
l_ast_data (which points to the OSC object).

Speculative locks (AGL, lockahead) do not have l_ast_data
initialised until an IO happens under the lock. Thus, some
speculative locks may not have l_ast_data initialized.

It is possible for the client to do a write using one lock
(changing file size), but for the glimpse AST to be sent to
another lock without l_ast_data initialized.  Currently, a
lock with no l_ast_data set returns ELDLM_NO_LOCK_DATA to
the server.  In this case, this means we do not return the
updated size.

The solution is to search the granted lock tree for any lock
with initialized l_ast_data (it points to the OSC object
which is the same for all the extent locks) and to reach the
clio layers for the size through this lock instead.

cray-bug-id: LUS-6747
Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: I6c60f4133154a3d6652315f155af24bbc5752dd2
Reviewed-on: https://review.whamcloud.com/33660
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Bobi Jam <bobijam@hotmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---

diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h
index cb265ae..6f6cf85 100644
--- a/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@ -965,6 +965,20 @@ struct ldlm_lock {
 	struct list_head	l_exp_list;
 };
 
+/**
+ * Describe the overlap between two locks.  itree_overlap_cb data.
+ */
+struct ldlm_match_data {
+	struct ldlm_lock	*lmd_old;
+	struct ldlm_lock	*lmd_lock;
+	enum ldlm_mode		*lmd_mode;
+	union ldlm_policy_data	*lmd_policy;
+	__u64			 lmd_flags;
+	__u64			 lmd_skip_flags;
+	int			 lmd_unref;
+	bool			 lmd_has_ast_data;
+};
+
 /** For uncommitted cross-MDT lock, store transno this lock belongs to */
 #define l_transno l_client_cookie
 
@@ -1543,7 +1557,8 @@ static inline enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns,
 	return ldlm_lock_match_with_skip(ns, flags, 0, res_id, type, policy,
 					 mode, lh, unref);
 }
-
+struct ldlm_lock *search_itree(struct ldlm_resource *res,
+			       struct ldlm_match_data *data);
 enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh,
 					   __u64 *bits);
 void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode);
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index 6b73b9a..6ef38a0 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -419,6 +419,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OSC_DELAY_SETTIME	 0x412
 #define OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413
 #define OBD_FAIL_OSC_DELAY_IO            0x414
+#define OBD_FAIL_OSC_NO_SIZE_DATA        0x415
 
 #define OBD_FAIL_PTLRPC                  0x500
 #define OBD_FAIL_PTLRPC_ACK              0x501
diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c
index 701a772..5baae7b 100644
--- a/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@ -1155,29 +1155,16 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list)
 }
 
 /**
- * Describe the overlap between two locks.  itree_overlap_cb data.
- */
-struct lock_match_data {
-	struct ldlm_lock	*lmd_old;
-	struct ldlm_lock	*lmd_lock;
-	enum ldlm_mode		*lmd_mode;
-	union ldlm_policy_data	*lmd_policy;
-	__u64			 lmd_flags;
-	__u64			 lmd_skip_flags;
-	int			 lmd_unref;
-};
-
-/**
  * Check if the given @lock meets the criteria for a match.
  * A reference on the lock is taken if matched.
  *
  * \param lock     test-against this lock
  * \param data	   parameters
  */
-static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
+static int lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data)
 {
 	union ldlm_policy_data *lpol = &lock->l_policy_data;
-	enum ldlm_mode match;
+	enum ldlm_mode match = LCK_MINMODE;
 
 	if (lock == data->lmd_old)
 		return INTERVAL_ITER_STOP;
@@ -1202,6 +1189,17 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
 
 	if (!(lock->l_req_mode & *data->lmd_mode))
 		return INTERVAL_ITER_CONT;
+
+	/* When we search for ast_data, we are not doing a traditional match,
+	 * so we don't worry about IBITS or extent matching.
+	 */
+	if (data->lmd_has_ast_data) {
+		if (!lock->l_ast_data)
+			return INTERVAL_ITER_CONT;
+
+		goto matched;
+	}
+
 	match = lock->l_req_mode;
 
 	switch (lock->l_resource->lr_type) {
@@ -1239,6 +1237,7 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
 	if (data->lmd_skip_flags & lock->l_flags)
 		return INTERVAL_ITER_CONT;
 
+matched:
 	if (data->lmd_flags & LDLM_FL_TEST_LOCK) {
 		LDLM_LOCK_GET(lock);
 		ldlm_lock_touch_in_lru(lock);
@@ -1255,7 +1254,7 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
 static unsigned int itree_overlap_cb(struct interval_node *in, void *args)
 {
 	struct ldlm_interval *node = to_ldlm_interval(in);
-	struct lock_match_data *data = args;
+	struct ldlm_match_data *data = args;
 	struct ldlm_lock *lock;
 	int rc;
 
@@ -1275,8 +1274,8 @@ static unsigned int itree_overlap_cb(struct interval_node *in, void *args)
  *
  * \retval a referenced lock or NULL.
  */
-static struct ldlm_lock *search_itree(struct ldlm_resource *res,
-				      struct lock_match_data *data)
+struct ldlm_lock *search_itree(struct ldlm_resource *res,
+			       struct ldlm_match_data *data)
 {
 	struct interval_node_extent ext = {
 		.start     = data->lmd_policy->l_extent.start,
@@ -1303,6 +1302,7 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res,
 
 	return NULL;
 }
+EXPORT_SYMBOL(search_itree);
 
 
 /**
@@ -1314,7 +1314,7 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res,
  * \retval a referenced lock or NULL.
  */
 static struct ldlm_lock *search_queue(struct list_head *queue,
-				      struct lock_match_data *data)
+				      struct ldlm_match_data *data)
 {
 	struct ldlm_lock *lock;
 	int rc;
@@ -1410,7 +1410,7 @@ enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns,
 					 enum ldlm_mode mode,
 					 struct lustre_handle *lockh, int unref)
 {
-	struct lock_match_data data = {
+	struct ldlm_match_data data = {
 		.lmd_old = NULL,
 		.lmd_lock = NULL,
 		.lmd_mode = &mode,
@@ -1418,6 +1418,7 @@ enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns,
 		.lmd_flags = flags,
 		.lmd_skip_flags = skip_flags,
 		.lmd_unref = unref,
+		.lmd_has_ast_data = false,
 	};
 	struct ldlm_resource *res;
 	struct ldlm_lock *lock;
diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c
index 90811af..701818b6 100644
--- a/lustre/osc/osc_lock.c
+++ b/lustre/osc/osc_lock.c
@@ -554,6 +554,10 @@ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
 	struct ost_lvb		*lvb;
 	struct req_capsule	*cap;
 	struct cl_object	*obj = NULL;
+	struct ldlm_resource	*res = dlmlock->l_resource;
+	struct ldlm_match_data  matchdata = { 0 };
+	union ldlm_policy_data  policy;
+	enum ldlm_mode		mode = LCK_PW | LCK_GROUP | LCK_PR;
 	int			result;
 	__u16			refcheck;
 
@@ -565,13 +569,40 @@ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
 	if (IS_ERR(env))
 		GOTO(out, result = PTR_ERR(env));
 
+	policy.l_extent.start = 0;
+	policy.l_extent.end = LUSTRE_EOF;
 
-	lock_res_and_lock(dlmlock);
-	if (dlmlock->l_ast_data != NULL) {
-		obj = osc2cl(dlmlock->l_ast_data);
-		cl_object_get(obj);
+	matchdata.lmd_mode = &mode;
+	matchdata.lmd_policy = &policy;
+	matchdata.lmd_flags = LDLM_FL_TEST_LOCK | LDLM_FL_CBPENDING;
+	matchdata.lmd_unref = 1;
+	matchdata.lmd_has_ast_data = true;
+
+	LDLM_LOCK_GET(dlmlock);
+
+	/* If any dlmlock has l_ast_data set, we must find it or we risk
+	 * missing a size update done under a different lock.
+	 */
+	while (dlmlock) {
+		lock_res_and_lock(dlmlock);
+		if (dlmlock->l_ast_data) {
+			obj = osc2cl(dlmlock->l_ast_data);
+			cl_object_get(obj);
+		}
+		unlock_res_and_lock(dlmlock);
+		LDLM_LOCK_PUT(dlmlock);
+
+		dlmlock = NULL;
+
+		if (obj == NULL && res->lr_type == LDLM_EXTENT) {
+			if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_SIZE_DATA))
+				break;
+
+			lock_res(res);
+			dlmlock = search_itree(res, &matchdata);
+			unlock_res(res);
+		}
 	}
-	unlock_res_and_lock(dlmlock);
 
 	if (obj != NULL) {
 		/* Do not grab the mutex of cl_lock for glimpse.
diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c
index 74b92e3..f75bcfa9 100644
--- a/lustre/target/tgt_handler.c
+++ b/lustre/target/tgt_handler.c
@@ -2508,6 +2508,9 @@ int tgt_brw_write(struct tgt_session_info *tsi)
 	CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ?
 			 cfs_fail_val : (obd_timeout + 1) / 4);
 
+	/* Delay write commit to show stale size information */
+	CFS_FAIL_TIMEOUT(OBD_FAIL_OSC_NO_SIZE_DATA, cfs_fail_val);
+
 	/* There must be big cache in current thread to process this request
 	 * if it is NULL then something went wrong and it wasn't allocated,
 	 * report -ENOMEM in that case */
diff --git a/lustre/tests/lockahead_test.c b/lustre/tests/lockahead_test.c
index 07ce3ab..7fd180e 100644
--- a/lustre/tests/lockahead_test.c
+++ b/lustre/tests/lockahead_test.c
@@ -74,10 +74,13 @@
 
 /* Name of file/directory. Will be set once and will not change. */
 static char mainpath[PATH_MAX];
+/* Path to same file/directory on second mount */
+static char mainpath2[PATH_MAX];
 static char *mainfile;
 
 static char fsmountdir[PATH_MAX];	/* Lustre mountpoint */
 static char *lustre_dir;		/* Test directory inside Lustre */
+static char *lustre_dir2;		/* Same dir but on second mountpoint */
 static int single_test;			/* Number of a single test to execute*/
 
 /* Cleanup our test file. */
@@ -1077,9 +1080,125 @@ static int test22(void)
 	return 0;
 }
 
+/* Do lockahead requests from two mount points & sanity check size
+ *
+ * The key thing here is that client2 updates the size by writing, then asks
+ * for another lock beyond that.  That next lock is never used.
+ * The bug (LU-11670) is that the glimpse for client1 will only check the
+ * highest lock and miss the size update made by the lower lock.
+ */
+static int test23(void)
+{
+	struct llapi_lu_ladvise advice;
+	size_t write_size = 1024 * 1024;
+	char buf[write_size];
+	const int count = 1;
+	struct stat sb;
+	struct stat sb2;
+	int fd;
+	/* On second mount */
+	int fd2;
+	int rc;
+	int i;
+
+	fd = open(mainpath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+	ASSERTF(fd >= 0, "open failed for '%s': %s",
+		mainpath, strerror(errno));
+
+	/* mainpath2 is a different Lustre mount */
+	fd2 = open(mainpath2, O_RDWR, S_IRUSR | S_IWUSR);
+	ASSERTF(fd2 >= 0, "open failed for '%s': %s",
+		mainpath2, strerror(errno));
+
+	/* Lock + write MiB 1 from second client */
+	setup_ladvise_lockahead(&advice, MODE_WRITE_USER, 0, 0,
+				write_size - 1, true);
+
+	/* Manually set the result so we can verify it's being modified */
+	advice.lla_lockahead_result = 345678;
+
+	rc = llapi_ladvise(fd2, 0, count, &advice);
+	ASSERTF(rc == 0,
+		"cannot lockahead '%s': %s", mainpath2, strerror(errno));
+	ASSERTF(advice.lla_lockahead_result == 0,
+		"unexpected extent result: %d",
+		advice.lla_lockahead_result);
+
+	/* Ask again until we get the lock (status 1). */
+	for (i = 1; i < 100; i++) {
+		usleep(100000); /* 0.1 second */
+		advice.lla_lockahead_result = 456789;
+		rc = llapi_ladvise(fd2, 0, count, &advice);
+		ASSERTF(rc == 0, "cannot lockahead '%s': %s",
+			mainpath2, strerror(errno));
+
+		if (advice.lla_lockahead_result > 0)
+			break;
+	}
+
+	ASSERTF(advice.lla_lockahead_result == LLA_RESULT_SAME,
+		"unexpected extent result: %d",
+		advice.lla_lockahead_result);
+
+	rc = write(fd2, buf, write_size);
+	ASSERTF(rc == sizeof(buf), "write failed for '%s': %s",
+		mainpath2, strerror(errno));
+
+	/* Lock (but don't write) MiB 2 from second client */
+	setup_ladvise_lockahead(&advice, MODE_WRITE_USER, 0, write_size,
+				2*write_size - 1, true);
+
+	/* Manually set the result so we can verify it's being modified */
+	advice.lla_lockahead_result = 345678;
+
+	rc = llapi_ladvise(fd2, 0, count, &advice);
+	ASSERTF(rc == 0,
+		"cannot lockahead '%s': %s", mainpath2, strerror(errno));
+	ASSERTF(advice.lla_lockahead_result == 0,
+		"unexpected extent result: %d",
+		advice.lla_lockahead_result);
+
+	/* Ask again until we get the lock (status 1). */
+	for (i = 1; i < 100; i++) {
+		usleep(100000); /* 0.1 second */
+		advice.lla_lockahead_result = 456789;
+		rc = llapi_ladvise(fd2, 0, count, &advice);
+		ASSERTF(rc == 0, "cannot lockahead '%s': %s",
+			mainpath2, strerror(errno));
+
+		if (advice.lla_lockahead_result > 0)
+			break;
+	}
+
+	ASSERTF(advice.lla_lockahead_result == LLA_RESULT_SAME,
+		"unexpected extent result: %d",
+		advice.lla_lockahead_result);
+
+	rc = fstat(fd, &sb);
+	ASSERTF(!rc, "stat failed for '%s': %s",
+		mainpath, strerror(errno));
+	rc = fstat(fd2, &sb2);
+	ASSERTF(!rc, "stat failed for '%s': %s",
+		mainpath2, strerror(errno));
+
+	ASSERTF(sb.st_size == sb2.st_size,
+		"size on %s and %s differs: %lu vs %lu",
+		mainpath, mainpath2, sb.st_size, sb2.st_size);
+
+	ASSERTF(sb.st_size == write_size, "size %lu != bytes written (%lu)",
+		sb.st_size, write_size);
+
+	close(fd);
+	close(fd2);
+
+	return 0;
+}
+
 static void usage(char *prog)
 {
-	fprintf(stderr, "Usage: %s [-d lustre_dir], [-t single_test]\n", prog);
+	fprintf(stderr,
+		"Usage: %s [-d lustre_dir], [-D lustre_dir2] [-t test]\n",
+		prog);
 	exit(-1);
 }
 
@@ -1087,7 +1206,7 @@ static void process_args(int argc, char *argv[])
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "d:f:t:")) != -1) {
+	while ((c = getopt(argc, argv, "d:D:f:t:")) != -1) {
 		switch (c) {
 		case 'f':
 			mainfile = optarg;
@@ -1095,6 +1214,9 @@ static void process_args(int argc, char *argv[])
 		case 'd':
 			lustre_dir = optarg;
 			break;
+		case 'D':
+			lustre_dir2 = optarg;
+			break;
 		case 't':
 			single_test = atoi(optarg);
 			break;
@@ -1125,6 +1247,16 @@ int main(int argc, char *argv[])
 		return -1;
 	}
 
+	if (lustre_dir2) {
+		rc = llapi_search_mounts(lustre_dir2, 0, fsmountdir, fsname);
+		if (rc != 0) {
+			fprintf(stderr,
+				"Error: '%s': not a Lustre filesystem\n",
+				lustre_dir2);
+			return -1;
+		}
+	}
+
 	/* Play nice with Lustre test scripts. Non-line buffered output
 	 * stream under I/O redirection may appear incorrectly. */
 	setvbuf(stdout, NULL, _IOLBF, 0);
@@ -1133,6 +1265,14 @@ int main(int argc, char *argv[])
 	rc = snprintf(mainpath, sizeof(mainpath), "%s/%s", lustre_dir,
 		      mainfile);
 	ASSERTF(rc > 0 && rc < sizeof(mainpath), "invalid name for mainpath");
+
+	if (lustre_dir2) {
+		rc = snprintf(mainpath2, sizeof(mainpath2), "%s/%s",
+			      lustre_dir2, mainfile);
+		ASSERTF(rc > 0 && rc < sizeof(mainpath2),
+			"invalid name for mainpath2");
+	}
+
 	cleanup();
 
 	switch (single_test) {
@@ -1150,6 +1290,9 @@ int main(int argc, char *argv[])
 		PERFORM(test20);
 		PERFORM(test21);
 		PERFORM(test22);
+		/* Some tests require a second mount point */
+		if (lustre_dir2)
+			PERFORM(test23);
 		/* When running all the test cases, we can't use the return
 		 * from the last test case, as it might be non-zero to return
 		 * info, rather than for an error.  Test cases assert and exit
@@ -1195,6 +1338,11 @@ int main(int argc, char *argv[])
 	case 22:
 		PERFORM(test22);
 		break;
+	case 23:
+		ASSERTF(lustre_dir2,
+			"must provide second mount point for test 23");
+		PERFORM(test23);
+		break;
 	default:
 		fprintf(stderr, "impossible value of single_test %d\n",
 			single_test);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 98fa2ae..3ba63bc 100644
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -17568,7 +17568,7 @@ run_test 255b "check 'lfs ladvise -a dontneed'"
 
 test_255c() {
 	[ $OST1_VERSION -lt $(version_code 2.10.50) ] &&
-		skip "lustre < 2.10.53 does not support lockahead"
+		skip "lustre < 2.10.50 does not support lockahead"
 
 	local count
 	local new_count
diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh
index 3ab4d04..7d40075 100755
--- a/lustre/tests/sanityn.sh
+++ b/lustre/tests/sanityn.sh
@@ -4742,6 +4742,57 @@ test_102() {
 }
 run_test 102 "Test open by handle of unlinked file"
 
+# Compare file size between first & second mount, ensuring the client correctly
+# glimpses even with unused speculative locks - LU-11670
+test_103() {
+	[ $(lustre_version_code $ost1) -lt $(version_code 2.10.50) ] &&
+		skip "Lockahead needs OST version at least 2.10.50"
+
+	local testnum=23
+
+	test_mkdir -p $DIR/$tdir
+
+	# Force file on to OST0
+	$LFS setstripe -i 0 $DIR/$tdir
+
+	# Do not check multiple locks on glimpse
+	# OBD_FAIL_OSC_NO_SIZE_DATA 0x415
+	$LCTL set_param fail_loc=0x415
+
+	# Delay write commit by 2 seconds to guarantee glimpse wins race
+	# The same fail_loc is used on client & server so it can work in the
+	# single node sanity setup
+	do_facet ost1 $LCTL set_param fail_loc=0x415 fail_val=2
+
+	echo "Incorrect size expected (no glimpse fix):"
+	lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		error "Lockahead test $testnum passed with fail_loc set, ${rc}"
+	fi
+
+	# guarantee write commit timeout has expired
+	sleep 2
+
+	# Clear fail_loc on client
+	$LCTL set_param fail_loc=0
+
+	# Delay write commit by 2 seconds to guarantee glimpse wins race
+	# OBD_FAIL_OST_BRW_PAUSE_BULK 0x214
+	do_facet ost1 $LCTL set_param fail_loc=0x214 fail_val=2
+
+	# Write commit is still delayed by 2 seconds
+	lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
+	rc=$?
+	[ $rc -eq 0 ] || error "Lockahead test${testnum} failed, ${rc}"
+
+	# guarantee write commit timeout has expired
+	sleep 2
+
+	rm -f $DIR/$tfile || error "unable to delete $DIR/$tfile"
+}
+run_test 103 "Test size correctness with lockahead"
+
 log "cleanup: ======================================================"
 
 # kill and wait in each test only guarentee script finish, but command in script