Whamcloud - gitweb
LU-11670 osc: glimpse - search for active lock 06/36406/4
authorPatrick Farrell <pfarrell@whamcloud.com>
Mon, 9 Sep 2019 15:56:07 +0000 (11:56 -0400)
committerOleg Drokin <green@whamcloud.com>
Thu, 12 Dec 2019 23:05:15 +0000 (23:05 +0000)
When there are lock-ahead write locks on a file, the server
sends one glimpse AST RPC to each client having such (it
may have many) locks. This callback is sent to the lock
having the highest offset.

Client's glimpse callback goes up to the clio layers and
gets the global (not lock-specific) view of size.  The clio
layers are connected to the extent lock through the
l_ast_data (which points to the OSC object).

Speculative locks (AGL, lockahead) do not have l_ast_data
initialised until an IO happens under the lock. Thus, some
speculative locks may not have l_ast_data initialized.

It is possible for the client to do a write using one lock
(changing file size), but for the glimpse AST to be sent to
another lock without l_ast_data initialized.  Currently, a
lock with no l_ast_data set returns ELDLM_NO_LOCK_DATA to
the server.  In this case, this means we do not return the
updated size.

The solution is to search the granted lock tree for any lock
with initialized l_ast_data (it points to the OSC object
which is the same for all the extent locks) and to reach the
clio layers for the size through this lock instead.

Lustre-change: https://review.whamcloud.com/33660
Lustre-commit: b3461d11dcb04670cc2e1bfbb99306cfd3f645ef

cray-bug-id: LUS-6747
Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: I6c60f4133154a3d6652315f155af24bbc5752dd2
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Bobi Jam <bobijam@hotmail.com>
Signed-off-by: Minh Diep <mdiep@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/36406
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_dlm.h
lustre/include/obd_support.h
lustre/ldlm/ldlm_lock.c
lustre/osc/osc_lock.c
lustre/target/tgt_handler.c
lustre/tests/lockahead_test.c
lustre/tests/sanity.sh
lustre/tests/sanityn.sh

index 9c7a8fa..1babb16 100644 (file)
@@ -962,6 +962,20 @@ struct ldlm_lock {
        struct list_head        l_exp_list;
 };
 
+/**
+ * Describe the overlap between two locks.  itree_overlap_cb data.
+ */
+struct ldlm_match_data {
+       struct ldlm_lock        *lmd_old;
+       struct ldlm_lock        *lmd_lock;
+       enum ldlm_mode          *lmd_mode;
+       union ldlm_policy_data  *lmd_policy;
+       __u64                    lmd_flags;
+       __u64                    lmd_skip_flags;
+       int                      lmd_unref;
+       bool                     lmd_has_ast_data;
+};
+
 /** For uncommitted cross-MDT lock, store transno this lock belongs to */
 #define l_transno l_client_cookie
 
@@ -1539,7 +1553,8 @@ static inline enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns,
        return ldlm_lock_match_with_skip(ns, flags, 0, res_id, type, policy,
                                         mode, lh, unref);
 }
-
+struct ldlm_lock *search_itree(struct ldlm_resource *res,
+                              struct ldlm_match_data *data);
 enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh,
                                           __u64 *bits);
 void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode);
index 8417f3c..e595223 100644 (file)
@@ -414,6 +414,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OSC_DELAY_SETTIME      0x412
 #define OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413
 #define OBD_FAIL_OSC_DELAY_IO            0x414
+#define OBD_FAIL_OSC_NO_SIZE_DATA        0x415
 
 #define OBD_FAIL_PTLRPC                  0x500
 #define OBD_FAIL_PTLRPC_ACK              0x501
index 4e21617..3b403ff 100644 (file)
@@ -1157,29 +1157,16 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list)
 }
 
 /**
- * Describe the overlap between two locks.  itree_overlap_cb data.
- */
-struct lock_match_data {
-       struct ldlm_lock        *lmd_old;
-       struct ldlm_lock        *lmd_lock;
-       enum ldlm_mode          *lmd_mode;
-       union ldlm_policy_data  *lmd_policy;
-       __u64                    lmd_flags;
-       __u64                    lmd_skip_flags;
-       int                      lmd_unref;
-};
-
-/**
  * Check if the given @lock meets the criteria for a match.
  * A reference on the lock is taken if matched.
  *
  * \param lock     test-against this lock
  * \param data    parameters
  */
-static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
+static int lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data)
 {
        union ldlm_policy_data *lpol = &lock->l_policy_data;
-       enum ldlm_mode match;
+       enum ldlm_mode match = LCK_MINMODE;
 
        if (lock == data->lmd_old)
                return INTERVAL_ITER_STOP;
@@ -1204,6 +1191,17 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
 
        if (!(lock->l_req_mode & *data->lmd_mode))
                return INTERVAL_ITER_CONT;
+
+       /* When we search for ast_data, we are not doing a traditional match,
+        * so we don't worry about IBITS or extent matching.
+        */
+       if (data->lmd_has_ast_data) {
+               if (!lock->l_ast_data)
+                       return INTERVAL_ITER_CONT;
+
+               goto matched;
+       }
+
        match = lock->l_req_mode;
 
        switch (lock->l_resource->lr_type) {
@@ -1241,6 +1239,7 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
        if (data->lmd_skip_flags & lock->l_flags)
                return INTERVAL_ITER_CONT;
 
+matched:
        if (data->lmd_flags & LDLM_FL_TEST_LOCK) {
                LDLM_LOCK_GET(lock);
                ldlm_lock_touch_in_lru(lock);
@@ -1257,7 +1256,7 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
 static unsigned int itree_overlap_cb(struct interval_node *in, void *args)
 {
        struct ldlm_interval *node = to_ldlm_interval(in);
-       struct lock_match_data *data = args;
+       struct ldlm_match_data *data = args;
        struct ldlm_lock *lock;
        int rc;
 
@@ -1277,8 +1276,8 @@ static unsigned int itree_overlap_cb(struct interval_node *in, void *args)
  *
  * \retval a referenced lock or NULL.
  */
-static struct ldlm_lock *search_itree(struct ldlm_resource *res,
-                                     struct lock_match_data *data)
+struct ldlm_lock *search_itree(struct ldlm_resource *res,
+                              struct ldlm_match_data *data)
 {
        struct interval_node_extent ext = {
                .start     = data->lmd_policy->l_extent.start,
@@ -1300,6 +1299,7 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res,
        }
        return data->lmd_lock;
 }
+EXPORT_SYMBOL(search_itree);
 
 
 /**
@@ -1311,7 +1311,7 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res,
  * \retval a referenced lock or NULL.
  */
 static struct ldlm_lock *search_queue(struct list_head *queue,
-                                     struct lock_match_data *data)
+                                     struct ldlm_match_data *data)
 {
        struct ldlm_lock *lock;
        int rc;
@@ -1404,7 +1404,7 @@ enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns,
                                         enum ldlm_mode mode,
                                         struct lustre_handle *lockh, int unref)
 {
-       struct lock_match_data data = {
+       struct ldlm_match_data data = {
                .lmd_old = NULL,
                .lmd_lock = NULL,
                .lmd_mode = &mode,
@@ -1412,6 +1412,7 @@ enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns,
                .lmd_flags = flags,
                .lmd_skip_flags = skip_flags,
                .lmd_unref = unref,
+               .lmd_has_ast_data = false,
        };
        struct ldlm_resource *res;
        struct ldlm_lock *lock;
index aaadaab..ced5c41 100644 (file)
@@ -554,6 +554,10 @@ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
        struct ost_lvb          *lvb;
        struct req_capsule      *cap;
        struct cl_object        *obj = NULL;
+       struct ldlm_resource    *res = dlmlock->l_resource;
+       struct ldlm_match_data  matchdata = { 0 };
+       union ldlm_policy_data  policy;
+       enum ldlm_mode          mode = LCK_PW | LCK_GROUP | LCK_PR;
        int                     result;
        __u16                   refcheck;
 
@@ -565,13 +569,40 @@ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
        if (IS_ERR(env))
                GOTO(out, result = PTR_ERR(env));
 
+       policy.l_extent.start = 0;
+       policy.l_extent.end = LUSTRE_EOF;
 
-       lock_res_and_lock(dlmlock);
-       if (dlmlock->l_ast_data != NULL) {
-               obj = osc2cl(dlmlock->l_ast_data);
-               cl_object_get(obj);
+       matchdata.lmd_mode = &mode;
+       matchdata.lmd_policy = &policy;
+       matchdata.lmd_flags = LDLM_FL_TEST_LOCK | LDLM_FL_CBPENDING;
+       matchdata.lmd_unref = 1;
+       matchdata.lmd_has_ast_data = true;
+
+       LDLM_LOCK_GET(dlmlock);
+
+       /* If any dlmlock has l_ast_data set, we must find it or we risk
+        * missing a size update done under a different lock.
+        */
+       while (dlmlock) {
+               lock_res_and_lock(dlmlock);
+               if (dlmlock->l_ast_data) {
+                       obj = osc2cl(dlmlock->l_ast_data);
+                       cl_object_get(obj);
+               }
+               unlock_res_and_lock(dlmlock);
+               LDLM_LOCK_PUT(dlmlock);
+
+               dlmlock = NULL;
+
+               if (obj == NULL && res->lr_type == LDLM_EXTENT) {
+                       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_SIZE_DATA))
+                               break;
+
+                       lock_res(res);
+                       dlmlock = search_itree(res, &matchdata);
+                       unlock_res(res);
+               }
        }
-       unlock_res_and_lock(dlmlock);
 
        if (obj != NULL) {
                /* Do not grab the mutex of cl_lock for glimpse.
index 2ed8a02..85a8699 100644 (file)
@@ -2507,6 +2507,9 @@ int tgt_brw_write(struct tgt_session_info *tsi)
        CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ?
                         cfs_fail_val : (obd_timeout + 1) / 4);
 
+       /* Delay write commit to show stale size information */
+       CFS_FAIL_TIMEOUT(OBD_FAIL_OSC_NO_SIZE_DATA, cfs_fail_val);
+
        /* There must be big cache in current thread to process this request
         * if it is NULL then something went wrong and it wasn't allocated,
         * report -ENOMEM in that case */
index 07ce3ab..7fd180e 100644 (file)
 
 /* Name of file/directory. Will be set once and will not change. */
 static char mainpath[PATH_MAX];
+/* Path to same file/directory on second mount */
+static char mainpath2[PATH_MAX];
 static char *mainfile;
 
 static char fsmountdir[PATH_MAX];      /* Lustre mountpoint */
 static char *lustre_dir;               /* Test directory inside Lustre */
+static char *lustre_dir2;              /* Same dir but on second mountpoint */
 static int single_test;                        /* Number of a single test to execute*/
 
 /* Cleanup our test file. */
@@ -1077,9 +1080,125 @@ static int test22(void)
        return 0;
 }
 
+/* Do lockahead requests from two mount points & sanity check size
+ *
+ * The key thing here is that client2 updates the size by writing, then asks
+ * for another lock beyond that.  That next lock is never used.
+ * The bug (LU-11670) is that the glimpse for client1 will only check the
+ * highest lock and miss the size update made by the lower lock.
+ */
+static int test23(void)
+{
+       struct llapi_lu_ladvise advice;
+       size_t write_size = 1024 * 1024;
+       char buf[write_size];
+       const int count = 1;
+       struct stat sb;
+       struct stat sb2;
+       int fd;
+       /* On second mount */
+       int fd2;
+       int rc;
+       int i;
+
+       fd = open(mainpath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+       ASSERTF(fd >= 0, "open failed for '%s': %s",
+               mainpath, strerror(errno));
+
+       /* mainpath2 is a different Lustre mount */
+       fd2 = open(mainpath2, O_RDWR, S_IRUSR | S_IWUSR);
+       ASSERTF(fd2 >= 0, "open failed for '%s': %s",
+               mainpath2, strerror(errno));
+
+       /* Lock + write MiB 1 from second client */
+       setup_ladvise_lockahead(&advice, MODE_WRITE_USER, 0, 0,
+                               write_size - 1, true);
+
+       /* Manually set the result so we can verify it's being modified */
+       advice.lla_lockahead_result = 345678;
+
+       rc = llapi_ladvise(fd2, 0, count, &advice);
+       ASSERTF(rc == 0,
+               "cannot lockahead '%s': %s", mainpath2, strerror(errno));
+       ASSERTF(advice.lla_lockahead_result == 0,
+               "unexpected extent result: %d",
+               advice.lla_lockahead_result);
+
+       /* Ask again until we get the lock (status 1). */
+       for (i = 1; i < 100; i++) {
+               usleep(100000); /* 0.1 second */
+               advice.lla_lockahead_result = 456789;
+               rc = llapi_ladvise(fd2, 0, count, &advice);
+               ASSERTF(rc == 0, "cannot lockahead '%s': %s",
+                       mainpath2, strerror(errno));
+
+               if (advice.lla_lockahead_result > 0)
+                       break;
+       }
+
+       ASSERTF(advice.lla_lockahead_result == LLA_RESULT_SAME,
+               "unexpected extent result: %d",
+               advice.lla_lockahead_result);
+
+       rc = write(fd2, buf, write_size);
+       ASSERTF(rc == sizeof(buf), "write failed for '%s': %s",
+               mainpath2, strerror(errno));
+
+       /* Lock (but don't write) MiB 2 from second client */
+       setup_ladvise_lockahead(&advice, MODE_WRITE_USER, 0, write_size,
+                               2*write_size - 1, true);
+
+       /* Manually set the result so we can verify it's being modified */
+       advice.lla_lockahead_result = 345678;
+
+       rc = llapi_ladvise(fd2, 0, count, &advice);
+       ASSERTF(rc == 0,
+               "cannot lockahead '%s': %s", mainpath2, strerror(errno));
+       ASSERTF(advice.lla_lockahead_result == 0,
+               "unexpected extent result: %d",
+               advice.lla_lockahead_result);
+
+       /* Ask again until we get the lock (status 1). */
+       for (i = 1; i < 100; i++) {
+               usleep(100000); /* 0.1 second */
+               advice.lla_lockahead_result = 456789;
+               rc = llapi_ladvise(fd2, 0, count, &advice);
+               ASSERTF(rc == 0, "cannot lockahead '%s': %s",
+                       mainpath2, strerror(errno));
+
+               if (advice.lla_lockahead_result > 0)
+                       break;
+       }
+
+       ASSERTF(advice.lla_lockahead_result == LLA_RESULT_SAME,
+               "unexpected extent result: %d",
+               advice.lla_lockahead_result);
+
+       rc = fstat(fd, &sb);
+       ASSERTF(!rc, "stat failed for '%s': %s",
+               mainpath, strerror(errno));
+       rc = fstat(fd2, &sb2);
+       ASSERTF(!rc, "stat failed for '%s': %s",
+               mainpath2, strerror(errno));
+
+       ASSERTF(sb.st_size == sb2.st_size,
+               "size on %s and %s differs: %lu vs %lu",
+               mainpath, mainpath2, sb.st_size, sb2.st_size);
+
+       ASSERTF(sb.st_size == write_size, "size %lu != bytes written (%lu)",
+               sb.st_size, write_size);
+
+       close(fd);
+       close(fd2);
+
+       return 0;
+}
+
 static void usage(char *prog)
 {
-       fprintf(stderr, "Usage: %s [-d lustre_dir], [-t single_test]\n", prog);
+       fprintf(stderr,
+               "Usage: %s [-d lustre_dir], [-D lustre_dir2] [-t test]\n",
+               prog);
        exit(-1);
 }
 
@@ -1087,7 +1206,7 @@ static void process_args(int argc, char *argv[])
 {
        int c;
 
-       while ((c = getopt(argc, argv, "d:f:t:")) != -1) {
+       while ((c = getopt(argc, argv, "d:D:f:t:")) != -1) {
                switch (c) {
                case 'f':
                        mainfile = optarg;
@@ -1095,6 +1214,9 @@ static void process_args(int argc, char *argv[])
                case 'd':
                        lustre_dir = optarg;
                        break;
+               case 'D':
+                       lustre_dir2 = optarg;
+                       break;
                case 't':
                        single_test = atoi(optarg);
                        break;
@@ -1125,6 +1247,16 @@ int main(int argc, char *argv[])
                return -1;
        }
 
+       if (lustre_dir2) {
+               rc = llapi_search_mounts(lustre_dir2, 0, fsmountdir, fsname);
+               if (rc != 0) {
+                       fprintf(stderr,
+                               "Error: '%s': not a Lustre filesystem\n",
+                               lustre_dir2);
+                       return -1;
+               }
+       }
+
        /* Play nice with Lustre test scripts. Non-line buffered output
         * stream under I/O redirection may appear incorrectly. */
        setvbuf(stdout, NULL, _IOLBF, 0);
@@ -1133,6 +1265,14 @@ int main(int argc, char *argv[])
        rc = snprintf(mainpath, sizeof(mainpath), "%s/%s", lustre_dir,
                      mainfile);
        ASSERTF(rc > 0 && rc < sizeof(mainpath), "invalid name for mainpath");
+
+       if (lustre_dir2) {
+               rc = snprintf(mainpath2, sizeof(mainpath2), "%s/%s",
+                             lustre_dir2, mainfile);
+               ASSERTF(rc > 0 && rc < sizeof(mainpath2),
+                       "invalid name for mainpath2");
+       }
+
        cleanup();
 
        switch (single_test) {
@@ -1150,6 +1290,9 @@ int main(int argc, char *argv[])
                PERFORM(test20);
                PERFORM(test21);
                PERFORM(test22);
+               /* Some tests require a second mount point */
+               if (lustre_dir2)
+                       PERFORM(test23);
                /* When running all the test cases, we can't use the return
                 * from the last test case, as it might be non-zero to return
                 * info, rather than for an error.  Test cases assert and exit
@@ -1195,6 +1338,11 @@ int main(int argc, char *argv[])
        case 22:
                PERFORM(test22);
                break;
+       case 23:
+               ASSERTF(lustre_dir2,
+                       "must provide second mount point for test 23");
+               PERFORM(test23);
+               break;
        default:
                fprintf(stderr, "impossible value of single_test %d\n",
                        single_test);
index 2f15240..b9461dc 100644 (file)
@@ -17100,7 +17100,7 @@ run_test 255b "check 'lfs ladvise -a dontneed'"
 
 test_255c() {
        [ $OST1_VERSION -lt $(version_code 2.10.50) ] &&
-               skip "lustre < 2.10.53 does not support lockahead"
+               skip "lustre < 2.10.50 does not support lockahead"
 
        local count
        local new_count
index 99b0bd8..490126f 100755 (executable)
@@ -4592,6 +4592,57 @@ test_102() {
 }
 run_test 102 "Test open by handle of unlinked file"
 
+# Compare file size between first & second mount, ensuring the client correctly
+# glimpses even with unused speculative locks - LU-11670
+test_103() {
+       [ $(lustre_version_code $ost1) -lt $(version_code 2.10.50) ] &&
+               skip "Lockahead needs OST version at least 2.10.50"
+
+       local testnum=23
+
+       test_mkdir -p $DIR/$tdir
+
+       # Force file on to OST0
+       $LFS setstripe -i 0 $DIR/$tdir
+
+       # Do not check multiple locks on glimpse
+       # OBD_FAIL_OSC_NO_SIZE_DATA 0x415
+       $LCTL set_param fail_loc=0x415
+
+       # Delay write commit by 2 seconds to guarantee glimpse wins race
+       # The same fail_loc is used on client & server so it can work in the
+       # single node sanity setup
+       do_facet ost1 $LCTL set_param fail_loc=0x415 fail_val=2
+
+       echo "Incorrect size expected (no glimpse fix):"
+       lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
+       rc=$?
+       if [ $rc -eq 0 ]; then
+               error "Lockahead test $testnum passed with fail_loc set, ${rc}"
+       fi
+
+       # guarantee write commit timeout has expired
+       sleep 2
+
+       # Clear fail_loc on client
+       $LCTL set_param fail_loc=0
+
+       # Delay write commit by 2 seconds to guarantee glimpse wins race
+       # OBD_FAIL_OST_BRW_PAUSE_BULK 0x214
+       do_facet ost1 $LCTL set_param fail_loc=0x214 fail_val=2
+
+       # Write commit is still delayed by 2 seconds
+       lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
+       rc=$?
+       [ $rc -eq 0 ] || error "Lockahead test${testnum} failed, ${rc}"
+
+       # guarantee write commit timeout has expired
+       sleep 2
+
+       rm -f $DIR/$tfile || error "unable to delete $DIR/$tfile"
+}
+run_test 103 "Test size correctness with lockahead"
+
 log "cleanup: ======================================================"
 
 # kill and wait in each test only guarentee script finish, but command in script