LU-11670 osc: glimpse - search for active lock

author Patrick Farrell <pfarrell@whamcloud.com>

Mon, 9 Sep 2019 15:56:07 +0000 (11:56 -0400)

committer Oleg Drokin <green@whamcloud.com>

Fri, 20 Sep 2019 07:50:58 +0000 (07:50 +0000)
author Patrick Farrell <pfarrell@whamcloud.com>
Mon, 9 Sep 2019 15:56:07 +0000 (11:56 -0400)
committer Oleg Drokin <green@whamcloud.com>
Fri, 20 Sep 2019 07:50:58 +0000 (07:50 +0000)
diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h

index cb265ae..6f6cf85 100644 (file)
--- a/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@ -965,6 +965,20 @@ struct ldlm_lock {
         struct list_head        l_exp_list;
  };
  
+/**
+ * Describe the overlap between two locks.  itree_overlap_cb data.
+ */
+struct ldlm_match_data {
+       struct ldlm_lock        *lmd_old;
+       struct ldlm_lock        *lmd_lock;
+       enum ldlm_mode          *lmd_mode;
+       union ldlm_policy_data  *lmd_policy;
+       __u64                    lmd_flags;
+       __u64                    lmd_skip_flags;
+       int                      lmd_unref;
+       bool                     lmd_has_ast_data;
+};
+
  /** For uncommitted cross-MDT lock, store transno this lock belongs to */
  #define l_transno l_client_cookie
  
@@ -1543,7 +1557,8 @@ static inline enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns,
         return ldlm_lock_match_with_skip(ns, flags, 0, res_id, type, policy,
                                          mode, lh, unref);
  }
-
+struct ldlm_lock *search_itree(struct ldlm_resource *res,
+                              struct ldlm_match_data *data);
  enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh,
                                            __u64 *bits);
  void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode);
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h

index 6b73b9a..6ef38a0 100644 (file)
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -419,6 +419,7 @@ extern char obd_jobid_var[];
  #define OBD_FAIL_OSC_DELAY_SETTIME      0x412
  #define OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413
  #define OBD_FAIL_OSC_DELAY_IO            0x414
+#define OBD_FAIL_OSC_NO_SIZE_DATA        0x415
  
  #define OBD_FAIL_PTLRPC                  0x500
  #define OBD_FAIL_PTLRPC_ACK              0x501
diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c

index 701a772..5baae7b 100644 (file)
--- a/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@ -1155,29 +1155,16 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list)
  }
  
  /**
- * Describe the overlap between two locks.  itree_overlap_cb data.
- */
-struct lock_match_data {
-       struct ldlm_lock        *lmd_old;
-       struct ldlm_lock        *lmd_lock;
-       enum ldlm_mode          *lmd_mode;
-       union ldlm_policy_data  *lmd_policy;
-       __u64                    lmd_flags;
-       __u64                    lmd_skip_flags;
-       int                      lmd_unref;
-};
-
-/**
   * Check if the given @lock meets the criteria for a match.
   * A reference on the lock is taken if matched.
   *
   * \param lock     test-against this lock
   * \param data    parameters
   */
-static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
+static int lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data)
  {
         union ldlm_policy_data *lpol = &lock->l_policy_data;
-       enum ldlm_mode match;
+       enum ldlm_mode match = LCK_MINMODE;
  
         if (lock == data->lmd_old)
                 return INTERVAL_ITER_STOP;
@@ -1202,6 +1189,17 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
  
         if (!(lock->l_req_mode & *data->lmd_mode))
                 return INTERVAL_ITER_CONT;
+
+       /* When we search for ast_data, we are not doing a traditional match,
+        * so we don't worry about IBITS or extent matching.
+        */
+       if (data->lmd_has_ast_data) {
+               if (!lock->l_ast_data)
+                       return INTERVAL_ITER_CONT;
+
+               goto matched;
+       }
+
         match = lock->l_req_mode;
  
         switch (lock->l_resource->lr_type) {
@@ -1239,6 +1237,7 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
         if (data->lmd_skip_flags & lock->l_flags)
                 return INTERVAL_ITER_CONT;
  
+matched:
         if (data->lmd_flags & LDLM_FL_TEST_LOCK) {
                 LDLM_LOCK_GET(lock);
                 ldlm_lock_touch_in_lru(lock);
@@ -1255,7 +1254,7 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
  static unsigned int itree_overlap_cb(struct interval_node *in, void *args)
  {
         struct ldlm_interval *node = to_ldlm_interval(in);
-       struct lock_match_data *data = args;
+       struct ldlm_match_data *data = args;
         struct ldlm_lock *lock;
         int rc;
  
@@ -1275,8 +1274,8 @@ static unsigned int itree_overlap_cb(struct interval_node *in, void *args)
   *
   * \retval a referenced lock or NULL.
   */
-static struct ldlm_lock *search_itree(struct ldlm_resource *res,
-                                     struct lock_match_data *data)
+struct ldlm_lock *search_itree(struct ldlm_resource *res,
+                              struct ldlm_match_data *data)
  {
         struct interval_node_extent ext = {
                 .start     = data->lmd_policy->l_extent.start,
@@ -1303,6 +1302,7 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res,
  
         return NULL;
  }
+EXPORT_SYMBOL(search_itree);
  
  
  /**
@@ -1314,7 +1314,7 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res,
   * \retval a referenced lock or NULL.
   */
  static struct ldlm_lock *search_queue(struct list_head *queue,
-                                     struct lock_match_data *data)
+                                     struct ldlm_match_data *data)
  {
         struct ldlm_lock *lock;
         int rc;
@@ -1410,7 +1410,7 @@ enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns,
                                          enum ldlm_mode mode,
                                          struct lustre_handle *lockh, int unref)
  {
-       struct lock_match_data data = {
+       struct ldlm_match_data data = {
                 .lmd_old = NULL,
                 .lmd_lock = NULL,
                 .lmd_mode = &mode,
@@ -1418,6 +1418,7 @@ enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns,
                 .lmd_flags = flags,
                 .lmd_skip_flags = skip_flags,
                 .lmd_unref = unref,
+               .lmd_has_ast_data = false,
         };
         struct ldlm_resource *res;
         struct ldlm_lock *lock;
diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c

index 90811af..701818b 100644 (file)
--- a/lustre/osc/osc_lock.c
+++ b/lustre/osc/osc_lock.c
@@ -554,6 +554,10 @@ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
         struct ost_lvb          *lvb;
         struct req_capsule      *cap;
         struct cl_object        *obj = NULL;
+       struct ldlm_resource    *res = dlmlock->l_resource;
+       struct ldlm_match_data  matchdata = { 0 };
+       union ldlm_policy_data  policy;
+       enum ldlm_mode          mode = LCK_PW | LCK_GROUP | LCK_PR;
         int                     result;
         __u16                   refcheck;
  
@@ -565,13 +569,40 @@ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
         if (IS_ERR(env))
                 GOTO(out, result = PTR_ERR(env));
  
+       policy.l_extent.start = 0;
+       policy.l_extent.end = LUSTRE_EOF;
  
-       lock_res_and_lock(dlmlock);
-       if (dlmlock->l_ast_data != NULL) {
-               obj = osc2cl(dlmlock->l_ast_data);
-               cl_object_get(obj);
+       matchdata.lmd_mode = &mode;
+       matchdata.lmd_policy = &policy;
+       matchdata.lmd_flags = LDLM_FL_TEST_LOCK | LDLM_FL_CBPENDING;
+       matchdata.lmd_unref = 1;
+       matchdata.lmd_has_ast_data = true;
+
+       LDLM_LOCK_GET(dlmlock);
+
+       /* If any dlmlock has l_ast_data set, we must find it or we risk
+        * missing a size update done under a different lock.
+        */
+       while (dlmlock) {
+               lock_res_and_lock(dlmlock);
+               if (dlmlock->l_ast_data) {
+                       obj = osc2cl(dlmlock->l_ast_data);
+                       cl_object_get(obj);
+               }
+               unlock_res_and_lock(dlmlock);
+               LDLM_LOCK_PUT(dlmlock);
+
+               dlmlock = NULL;
+
+               if (obj == NULL && res->lr_type == LDLM_EXTENT) {
+                       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_SIZE_DATA))
+                               break;
+
+                       lock_res(res);
+                       dlmlock = search_itree(res, &matchdata);
+                       unlock_res(res);
+               }
         }
-       unlock_res_and_lock(dlmlock);
  
         if (obj != NULL) {
                 /* Do not grab the mutex of cl_lock for glimpse.
diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c

index 74b92e3..f75bcfa 100644 (file)
--- a/lustre/target/tgt_handler.c
+++ b/lustre/target/tgt_handler.c
@@ -2508,6 +2508,9 @@ int tgt_brw_write(struct tgt_session_info *tsi)
         CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ?
                          cfs_fail_val : (obd_timeout + 1) / 4);
  
+       /* Delay write commit to show stale size information */
+       CFS_FAIL_TIMEOUT(OBD_FAIL_OSC_NO_SIZE_DATA, cfs_fail_val);
+
         /* There must be big cache in current thread to process this request
          * if it is NULL then something went wrong and it wasn't allocated,
          * report -ENOMEM in that case */
diff --git a/lustre/tests/lockahead_test.c b/lustre/tests/lockahead_test.c

index 07ce3ab..7fd180e 100644 (file)
--- a/lustre/tests/lockahead_test.c
+++ b/lustre/tests/lockahead_test.c
@@ -74,10 +74,13 @@
  
  /* Name of file/directory. Will be set once and will not change. */
  static char mainpath[PATH_MAX];
+/* Path to same file/directory on second mount */
+static char mainpath2[PATH_MAX];
  static char *mainfile;
  
  static char fsmountdir[PATH_MAX];      /* Lustre mountpoint */
  static char *lustre_dir;               /* Test directory inside Lustre */
+static char *lustre_dir2;              /* Same dir but on second mountpoint */
  static int single_test;                        /* Number of a single test to execute*/
  
  /* Cleanup our test file. */
@@ -1077,9 +1080,125 @@ static int test22(void)
         return 0;
  }
  
+/* Do lockahead requests from two mount points & sanity check size
+ *
+ * The key thing here is that client2 updates the size by writing, then asks
+ * for another lock beyond that.  That next lock is never used.
+ * The bug (LU-11670) is that the glimpse for client1 will only check the
+ * highest lock and miss the size update made by the lower lock.
+ */
+static int test23(void)
+{
+       struct llapi_lu_ladvise advice;
+       size_t write_size = 1024 * 1024;
+       char buf[write_size];
+       const int count = 1;
+       struct stat sb;
+       struct stat sb2;
+       int fd;
+       /* On second mount */
+       int fd2;
+       int rc;
+       int i;
+
+       fd = open(mainpath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+       ASSERTF(fd >= 0, "open failed for '%s': %s",
+               mainpath, strerror(errno));
+
+       /* mainpath2 is a different Lustre mount */
+       fd2 = open(mainpath2, O_RDWR, S_IRUSR | S_IWUSR);
+       ASSERTF(fd2 >= 0, "open failed for '%s': %s",
+               mainpath2, strerror(errno));
+
+       /* Lock + write MiB 1 from second client */
+       setup_ladvise_lockahead(&advice, MODE_WRITE_USER, 0, 0,
+                               write_size - 1, true);
+
+       /* Manually set the result so we can verify it's being modified */
+       advice.lla_lockahead_result = 345678;
+
+       rc = llapi_ladvise(fd2, 0, count, &advice);
+       ASSERTF(rc == 0,
+               "cannot lockahead '%s': %s", mainpath2, strerror(errno));
+       ASSERTF(advice.lla_lockahead_result == 0,
+               "unexpected extent result: %d",
+               advice.lla_lockahead_result);
+
+       /* Ask again until we get the lock (status 1). */
+       for (i = 1; i < 100; i++) {
+               usleep(100000); /* 0.1 second */
+               advice.lla_lockahead_result = 456789;
+               rc = llapi_ladvise(fd2, 0, count, &advice);
+               ASSERTF(rc == 0, "cannot lockahead '%s': %s",
+                       mainpath2, strerror(errno));
+
+               if (advice.lla_lockahead_result > 0)
+                       break;
+       }
+
+       ASSERTF(advice.lla_lockahead_result == LLA_RESULT_SAME,
+               "unexpected extent result: %d",
+               advice.lla_lockahead_result);
+
+       rc = write(fd2, buf, write_size);
+       ASSERTF(rc == sizeof(buf), "write failed for '%s': %s",
+               mainpath2, strerror(errno));
+
+       /* Lock (but don't write) MiB 2 from second client */
+       setup_ladvise_lockahead(&advice, MODE_WRITE_USER, 0, write_size,
+                               2*write_size - 1, true);
+
+       /* Manually set the result so we can verify it's being modified */
+       advice.lla_lockahead_result = 345678;
+
+       rc = llapi_ladvise(fd2, 0, count, &advice);
+       ASSERTF(rc == 0,
+               "cannot lockahead '%s': %s", mainpath2, strerror(errno));
+       ASSERTF(advice.lla_lockahead_result == 0,
+               "unexpected extent result: %d",
+               advice.lla_lockahead_result);
+
+       /* Ask again until we get the lock (status 1). */
+       for (i = 1; i < 100; i++) {
+               usleep(100000); /* 0.1 second */
+               advice.lla_lockahead_result = 456789;
+               rc = llapi_ladvise(fd2, 0, count, &advice);
+               ASSERTF(rc == 0, "cannot lockahead '%s': %s",
+                       mainpath2, strerror(errno));
+
+               if (advice.lla_lockahead_result > 0)
+                       break;
+       }
+
+       ASSERTF(advice.lla_lockahead_result == LLA_RESULT_SAME,
+               "unexpected extent result: %d",
+               advice.lla_lockahead_result);
+
+       rc = fstat(fd, &sb);
+       ASSERTF(!rc, "stat failed for '%s': %s",
+               mainpath, strerror(errno));
+       rc = fstat(fd2, &sb2);
+       ASSERTF(!rc, "stat failed for '%s': %s",
+               mainpath2, strerror(errno));
+
+       ASSERTF(sb.st_size == sb2.st_size,
+               "size on %s and %s differs: %lu vs %lu",
+               mainpath, mainpath2, sb.st_size, sb2.st_size);
+
+       ASSERTF(sb.st_size == write_size, "size %lu != bytes written (%lu)",
+               sb.st_size, write_size);
+
+       close(fd);
+       close(fd2);
+
+       return 0;
+}
+
  static void usage(char *prog)
  {
-       fprintf(stderr, "Usage: %s [-d lustre_dir], [-t single_test]\n", prog);
+       fprintf(stderr,
+               "Usage: %s [-d lustre_dir], [-D lustre_dir2] [-t test]\n",
+               prog);
         exit(-1);
  }
  
@@ -1087,7 +1206,7 @@ static void process_args(int argc, char *argv[])
  {
         int c;
  
-       while ((c = getopt(argc, argv, "d:f:t:")) != -1) {
+       while ((c = getopt(argc, argv, "d:D:f:t:")) != -1) {
                 switch (c) {
                 case 'f':
                         mainfile = optarg;
@@ -1095,6 +1214,9 @@ static void process_args(int argc, char *argv[])
                 case 'd':
                         lustre_dir = optarg;
                         break;
+               case 'D':
+                       lustre_dir2 = optarg;
+                       break;
                 case 't':
                         single_test = atoi(optarg);
                         break;
@@ -1125,6 +1247,16 @@ int main(int argc, char *argv[])
                 return -1;
         }
  
+       if (lustre_dir2) {
+               rc = llapi_search_mounts(lustre_dir2, 0, fsmountdir, fsname);
+               if (rc != 0) {
+                       fprintf(stderr,
+                               "Error: '%s': not a Lustre filesystem\n",
+                               lustre_dir2);
+                       return -1;
+               }
+       }
+
         /* Play nice with Lustre test scripts. Non-line buffered output
          * stream under I/O redirection may appear incorrectly. */
         setvbuf(stdout, NULL, _IOLBF, 0);
@@ -1133,6 +1265,14 @@ int main(int argc, char *argv[])
         rc = snprintf(mainpath, sizeof(mainpath), "%s/%s", lustre_dir,
                       mainfile);
         ASSERTF(rc > 0 && rc < sizeof(mainpath), "invalid name for mainpath");
+
+       if (lustre_dir2) {
+               rc = snprintf(mainpath2, sizeof(mainpath2), "%s/%s",
+                             lustre_dir2, mainfile);
+               ASSERTF(rc > 0 && rc < sizeof(mainpath2),
+                       "invalid name for mainpath2");
+       }
+
         cleanup();
  
         switch (single_test) {
@@ -1150,6 +1290,9 @@ int main(int argc, char *argv[])
                 PERFORM(test20);
                 PERFORM(test21);
                 PERFORM(test22);
+               /* Some tests require a second mount point */
+               if (lustre_dir2)
+                       PERFORM(test23);
                 /* When running all the test cases, we can't use the return
                  * from the last test case, as it might be non-zero to return
                  * info, rather than for an error.  Test cases assert and exit
@@ -1195,6 +1338,11 @@ int main(int argc, char *argv[])
         case 22:
                 PERFORM(test22);
                 break;
+       case 23:
+               ASSERTF(lustre_dir2,
+                       "must provide second mount point for test 23");
+               PERFORM(test23);
+               break;
         default:
                 fprintf(stderr, "impossible value of single_test %d\n",
                         single_test);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh

index 98fa2ae..3ba63bc 100644 (file)
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -17568,7 +17568,7 @@ run_test 255b "check 'lfs ladvise -a dontneed'"
  
  test_255c() {
         [ $OST1_VERSION -lt $(version_code 2.10.50) ] &&
-               skip "lustre < 2.10.53 does not support lockahead"
+               skip "lustre < 2.10.50 does not support lockahead"
  
         local count
         local new_count
diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh

index 3ab4d04..7d40075 100755 (executable)
--- a/lustre/tests/sanityn.sh
+++ b/lustre/tests/sanityn.sh
@@ -4742,6 +4742,57 @@ test_102() {
  }
  run_test 102 "Test open by handle of unlinked file"
  
+# Compare file size between first & second mount, ensuring the client correctly
+# glimpses even with unused speculative locks - LU-11670
+test_103() {
+       [ $(lustre_version_code $ost1) -lt $(version_code 2.10.50) ] &&
+               skip "Lockahead needs OST version at least 2.10.50"
+
+       local testnum=23
+
+       test_mkdir -p $DIR/$tdir
+
+       # Force file on to OST0
+       $LFS setstripe -i 0 $DIR/$tdir
+
+       # Do not check multiple locks on glimpse
+       # OBD_FAIL_OSC_NO_SIZE_DATA 0x415
+       $LCTL set_param fail_loc=0x415
+
+       # Delay write commit by 2 seconds to guarantee glimpse wins race
+       # The same fail_loc is used on client & server so it can work in the
+       # single node sanity setup
+       do_facet ost1 $LCTL set_param fail_loc=0x415 fail_val=2
+
+       echo "Incorrect size expected (no glimpse fix):"
+       lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
+       rc=$?
+       if [ $rc -eq 0 ]; then
+               error "Lockahead test $testnum passed with fail_loc set, ${rc}"
+       fi
+
+       # guarantee write commit timeout has expired
+       sleep 2
+
+       # Clear fail_loc on client
+       $LCTL set_param fail_loc=0
+
+       # Delay write commit by 2 seconds to guarantee glimpse wins race
+       # OBD_FAIL_OST_BRW_PAUSE_BULK 0x214
+       do_facet ost1 $LCTL set_param fail_loc=0x214 fail_val=2
+
+       # Write commit is still delayed by 2 seconds
+       lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
+       rc=$?
+       [ $rc -eq 0 ] || error "Lockahead test${testnum} failed, ${rc}"
+
+       # guarantee write commit timeout has expired
+       sleep 2
+
+       rm -f $DIR/$tfile || error "unable to delete $DIR/$tfile"
+}
+run_test 103 "Test size correctness with lockahead"
+
  log "cleanup: ======================================================"
  
  # kill and wait in each test only guarentee script finish, but command in script
author	Patrick Farrell <pfarrell@whamcloud.com>
	Mon, 9 Sep 2019 15:56:07 +0000 (11:56 -0400)
committer	Oleg Drokin <green@whamcloud.com>
	Fri, 20 Sep 2019 07:50:58 +0000 (07:50 +0000)
lustre/include/lustre_dlm.h		patch \| blob \| history
lustre/include/obd_support.h		patch \| blob \| history
lustre/ldlm/ldlm_lock.c		patch \| blob \| history
lustre/osc/osc_lock.c		patch \| blob \| history
lustre/target/tgt_handler.c		patch \| blob \| history
lustre/tests/lockahead_test.c		patch \| blob \| history
lustre/tests/sanity.sh		patch \| blob \| history
lustre/tests/sanityn.sh		patch \| blob \| history