Whamcloud - gitweb
LU-11426 llog: changelog records reordering 87/36187/5
authorAndrew Perepechko <c17827@cray.com>
Tue, 17 Sep 2019 07:34:44 +0000 (10:34 +0300)
committerOleg Drokin <green@whamcloud.com>
Fri, 27 Sep 2019 23:11:55 +0000 (23:11 +0000)
Changelog records can get reordered because of a race
window between cr_index generation and llog file
space allocation. This can lead to llog records
loss.

llog_write() holds loghandle->lgh_lock semaphore,
so it seems an appropriate place to generate a
new changelog index.

Change-Id: I034d1a696bde1d0f780e494ab65073e4018ceec9
Signed-off-by: Andrew Perepechko <c17827@cray.com>
Reviewed-by: Alexander Boyko <c17825@cray.com>
Reviewed-by: Alexander Zarochentsev <c17826@cray.com>
Cray-bug-id: LUS-7691
Reviewed-on: https://review.whamcloud.com/36187
Reviewed-by: Alexandr Boyko <c17825@cray.com>
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
Reviewed-by: Yingjin Qian <qian@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/mdd/mdd_device.c
lustre/mdd/mdd_dir.c
lustre/mdd/mdd_internal.h
lustre/tests/sanity.sh

index 850fdd0..8a120a5 100644 (file)
@@ -245,6 +245,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
 #define OBD_FAIL_MDS_LLOG_CREATE_FAILED2 0x15b
 #define OBD_FAIL_MDS_FLD_LOOKUP                        0x15c
 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
 #define OBD_FAIL_MDS_LLOG_CREATE_FAILED2 0x15b
 #define OBD_FAIL_MDS_FLD_LOOKUP                        0x15c
+#define OBD_FAIL_MDS_CHANGELOG_REORDER 0x15d
 #define OBD_FAIL_MDS_INTENT_DELAY              0x160
 #define OBD_FAIL_MDS_XATTR_REP                 0x161
 #define OBD_FAIL_MDS_TRACK_OVERFLOW     0x162
 #define OBD_FAIL_MDS_INTENT_DELAY              0x160
 #define OBD_FAIL_MDS_XATTR_REP                 0x161
 #define OBD_FAIL_MDS_TRACK_OVERFLOW     0x162
index 54fe466..fef3dc1 100644 (file)
@@ -376,6 +376,8 @@ static int llog_changelog_cancel(const struct lu_env *env,
        RETURN(rc);
 }
 
        RETURN(rc);
 }
 
+static struct llog_operations changelog_orig_logops;
+
 static int
 mdd_changelog_write_header(const struct lu_env *env, struct mdd_device *mdd,
                           int markerflags);
 static int
 mdd_changelog_write_header(const struct lu_env *env, struct mdd_device *mdd,
                           int markerflags);
@@ -440,7 +442,7 @@ static int mdd_changelog_llog_init(const struct lu_env *env,
        OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
        obd->obd_lvfs_ctxt.dt = mdd->mdd_bottom;
        rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CHANGELOG_ORIG_CTXT,
        OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
        obd->obd_lvfs_ctxt.dt = mdd->mdd_bottom;
        rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CHANGELOG_ORIG_CTXT,
-                       obd, &llog_common_cat_ops);
+                       obd, &changelog_orig_logops);
        if (rc) {
                CERROR("%s: changelog llog setup failed: rc = %d\n",
                       obd->obd_name, rc);
        if (rc) {
                CERROR("%s: changelog llog setup failed: rc = %d\n",
                       obd->obd_name, rc);
@@ -473,7 +475,7 @@ static int mdd_changelog_llog_init(const struct lu_env *env,
 
        /* setup user changelog */
        rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CHANGELOG_USER_ORIG_CTXT,
 
        /* setup user changelog */
        rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CHANGELOG_USER_ORIG_CTXT,
-                       obd, &llog_common_cat_ops);
+                       obd, &changelog_orig_logops);
        if (rc) {
                CERROR("%s: changelog users llog setup failed: rc = %d\n",
                       obd->obd_name, rc);
        if (rc) {
                CERROR("%s: changelog users llog setup failed: rc = %d\n",
                       obd->obd_name, rc);
@@ -735,9 +737,6 @@ int mdd_changelog_write_header(const struct lu_env *env,
                                            rec->cr.cr_namelen);
        rec->cr_hdr.lrh_type = CHANGELOG_REC;
        rec->cr.cr_time = cl_time();
                                            rec->cr.cr_namelen);
        rec->cr_hdr.lrh_type = CHANGELOG_REC;
        rec->cr.cr_time = cl_time();
-       spin_lock(&mdd->mdd_cl.mc_lock);
-       rec->cr.cr_index = ++mdd->mdd_cl.mc_index;
-       spin_unlock(&mdd->mdd_cl.mc_lock);
 
        ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT);
        LASSERT(ctxt);
 
        ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT);
        LASSERT(ctxt);
@@ -2008,6 +2007,9 @@ static int __init mdd_init(void)
        if (rc)
                return rc;
 
        if (rc)
                return rc;
 
+       changelog_orig_logops = llog_common_cat_ops;
+       changelog_orig_logops.lop_write_rec = mdd_changelog_write_rec;
+
        rc = class_register_type(&mdd_obd_device_ops, NULL, false, NULL,
                                 LUSTRE_MDD_NAME, &mdd_device_type);
        if (rc)
        rc = class_register_type(&mdd_obd_device_ops, NULL, false, NULL,
                                 LUSTRE_MDD_NAME, &mdd_device_type);
        if (rc)
index d069874..f074f23 100644 (file)
@@ -784,6 +784,47 @@ out_put:
        return rc;
 }
 
        return rc;
 }
 
+int mdd_changelog_write_rec(const struct lu_env *env,
+                           struct llog_handle *loghandle,
+                           struct llog_rec_hdr *r,
+                           struct llog_cookie *cookie,
+                           int idx, struct thandle *th)
+{
+       int rc;
+
+       if (r->lrh_type == CHANGELOG_REC) {
+               struct mdd_device *mdd;
+               struct llog_changelog_rec *rec;
+
+               mdd = lu2mdd_dev(loghandle->lgh_ctxt->loc_obd->obd_lu_dev);
+               rec = container_of0(r, struct llog_changelog_rec, cr_hdr);
+
+               spin_lock(&mdd->mdd_cl.mc_lock);
+               rec->cr.cr_index = mdd->mdd_cl.mc_index + 1;
+               spin_unlock(&mdd->mdd_cl.mc_lock);
+
+               rc = llog_osd_ops.lop_write_rec(env, loghandle, r,
+                                               cookie, idx, th);
+
+               /*
+                * if current llog is full, we will generate a new
+                * llog, and since it's actually not an error, let's
+                * avoid increasing index so that userspace apps
+                * should not see a gap in the changelog sequence
+                */
+               if (!(rc == -ENOSPC && llog_is_full(loghandle))) {
+                       spin_lock(&mdd->mdd_cl.mc_lock);
+                       ++mdd->mdd_cl.mc_index;
+                       spin_unlock(&mdd->mdd_cl.mc_lock);
+               }
+       } else {
+               rc = llog_osd_ops.lop_write_rec(env, loghandle, r,
+                                               cookie, idx, th);
+       }
+
+       return rc;
+}
+
 /** Add a changelog entry \a rec to the changelog llog
  * \param mdd
  * \param rec
 /** Add a changelog entry \a rec to the changelog llog
  * \param mdd
  * \param rec
@@ -806,13 +847,6 @@ int mdd_changelog_store(const struct lu_env *env, struct mdd_device *mdd,
        rec->cr_hdr.lrh_type = CHANGELOG_REC;
        rec->cr.cr_time = cl_time();
 
        rec->cr_hdr.lrh_type = CHANGELOG_REC;
        rec->cr.cr_time = cl_time();
 
-       spin_lock(&mdd->mdd_cl.mc_lock);
-       /* NB: I suppose it's possible llog_add adds out of order wrt cr_index,
-        * but as long as the MDD transactions are ordered correctly for e.g.
-        * rename conflicts, I don't think this should matter. */
-       rec->cr.cr_index = ++mdd->mdd_cl.mc_index;
-       spin_unlock(&mdd->mdd_cl.mc_lock);
-
        ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT);
        if (ctxt == NULL)
                return -ENXIO;
        ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT);
        if (ctxt == NULL)
                return -ENXIO;
@@ -821,6 +855,7 @@ int mdd_changelog_store(const struct lu_env *env, struct mdd_device *mdd,
        if (IS_ERR(llog_th))
                GOTO(out_put, rc = PTR_ERR(llog_th));
 
        if (IS_ERR(llog_th))
                GOTO(out_put, rc = PTR_ERR(llog_th));
 
+       OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_CHANGELOG_REORDER, cfs_fail_val);
        /* nested journal transaction */
        rc = llog_add(env, ctxt->loc_handle, &rec->cr_hdr, NULL, llog_th);
 
        /* nested journal transaction */
        rc = llog_add(env, ctxt->loc_handle, &rec->cr_hdr, NULL, llog_th);
 
index 1d70d20..cc2e6aa 100644 (file)
@@ -279,6 +279,12 @@ int mdd_dir_layout_shrink(const struct lu_env *env,
                          struct md_object *md_obj,
                          const struct lu_buf *lmu_buf);
 
                          struct md_object *md_obj,
                          const struct lu_buf *lmu_buf);
 
+int mdd_changelog_write_rec(const struct lu_env *env,
+                           struct llog_handle *loghandle,
+                           struct llog_rec_hdr *rec,
+                           struct llog_cookie *cookie,
+                           int idx, struct thandle *th);
+
 struct mdd_thread_info *mdd_env_info(const struct lu_env *env);
 
 #define MDD_ENV_VAR(env, var) (&mdd_env_info(env)->mti_##var)
 struct mdd_thread_info *mdd_env_info(const struct lu_env *env);
 
 #define MDD_ENV_VAR(env, var) (&mdd_env_info(env)->mti_##var)
index 80dcd28..46ae2a1 100755 (executable)
@@ -14133,6 +14133,34 @@ test_160j() {
 }
 run_test 160j "client can be umounted  while its chanangelog is being used"
 
 }
 run_test 160j "client can be umounted  while its chanangelog is being used"
 
+test_160k() {
+       [ $PARALLEL == "yes" ] && skip "skip parallel run"
+       remote_mds_nodsh && skip "remote MDS with nodsh"
+
+       mkdir -p $DIR/$tdir/1/1
+
+       changelog_register || error "changelog_register failed"
+       local cl_user="${CL_USERS[$SINGLEMDS]%% *}"
+
+       changelog_users $SINGLEMDS | grep -q $cl_user ||
+               error "User '$cl_user' not found in changelog_users"
+#define OBD_FAIL_MDS_CHANGELOG_REORDER 0x15d
+       do_facet mds1 $LCTL set_param fail_loc=0x8000015d fail_val=3
+       rmdir $DIR/$tdir/1/1 & sleep 1
+       mkdir $DIR/$tdir/2
+       touch $DIR/$tdir/2/2
+       rm -rf $DIR/$tdir/2
+
+       wait
+       sleep 4
+
+       changelog_dump | grep rmdir || error "rmdir not recorded"
+
+       rm -rf $DIR/$tdir
+       changelog_deregister
+}
+run_test 160k "Verify that changelog records are not lost"
+
 test_161a() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run"
 
 test_161a() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run"