From: Andrew Perepechko Date: Tue, 17 Sep 2019 07:34:44 +0000 (+0300) Subject: LU-11426 llog: changelog records reordering X-Git-Tag: 2.12.90~97 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=1fa0a984c5c3863d8f40b3b0d63c3d08cfa1a9f0;hp=84ebd81d4295bc42fd78539aa1bca63a2b3a9295 LU-11426 llog: changelog records reordering Changelog records can get reordered because of a race window between cr_index generation and llog file space allocation. This can lead to llog records loss. llog_write() holds loghandle->lgh_lock semaphore, so it seems an appropriate place to generate a new changelog index. Change-Id: I034d1a696bde1d0f780e494ab65073e4018ceec9 Signed-off-by: Andrew Perepechko Reviewed-by: Alexander Boyko Reviewed-by: Alexander Zarochentsev Cray-bug-id: LUS-7691 Reviewed-on: https://review.whamcloud.com/36187 Reviewed-by: Alexandr Boyko Reviewed-by: Olaf Weber Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Mike Pershin Reviewed-by: Yingjin Qian Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 850fdd0..8a120a5 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -245,6 +245,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a #define OBD_FAIL_MDS_LLOG_CREATE_FAILED2 0x15b #define OBD_FAIL_MDS_FLD_LOOKUP 0x15c +#define OBD_FAIL_MDS_CHANGELOG_REORDER 0x15d #define OBD_FAIL_MDS_INTENT_DELAY 0x160 #define OBD_FAIL_MDS_XATTR_REP 0x161 #define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162 diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c index 54fe466..fef3dc1 100644 --- a/lustre/mdd/mdd_device.c +++ b/lustre/mdd/mdd_device.c @@ -376,6 +376,8 @@ static int llog_changelog_cancel(const struct lu_env *env, RETURN(rc); } +static struct llog_operations changelog_orig_logops; + static int mdd_changelog_write_header(const struct lu_env *env, struct mdd_device *mdd, int markerflags); @@ -440,7 +442,7 @@ static int mdd_changelog_llog_init(const struct lu_env *env, OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); obd->obd_lvfs_ctxt.dt = mdd->mdd_bottom; rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CHANGELOG_ORIG_CTXT, - obd, &llog_common_cat_ops); + obd, &changelog_orig_logops); if (rc) { CERROR("%s: changelog llog setup failed: rc = %d\n", obd->obd_name, rc); @@ -473,7 +475,7 @@ static int mdd_changelog_llog_init(const struct lu_env *env, /* setup user changelog */ rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CHANGELOG_USER_ORIG_CTXT, - obd, &llog_common_cat_ops); + obd, &changelog_orig_logops); if (rc) { CERROR("%s: changelog users llog setup failed: rc = %d\n", obd->obd_name, rc); @@ -735,9 +737,6 @@ int mdd_changelog_write_header(const struct lu_env *env, rec->cr.cr_namelen); rec->cr_hdr.lrh_type = CHANGELOG_REC; rec->cr.cr_time = cl_time(); - spin_lock(&mdd->mdd_cl.mc_lock); - rec->cr.cr_index = ++mdd->mdd_cl.mc_index; - spin_unlock(&mdd->mdd_cl.mc_lock); ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT); LASSERT(ctxt); @@ -2008,6 +2007,9 @@ static int __init mdd_init(void) if (rc) return rc; + changelog_orig_logops = llog_common_cat_ops; + changelog_orig_logops.lop_write_rec = mdd_changelog_write_rec; + rc = class_register_type(&mdd_obd_device_ops, NULL, false, NULL, LUSTRE_MDD_NAME, &mdd_device_type); if (rc) diff --git a/lustre/mdd/mdd_dir.c b/lustre/mdd/mdd_dir.c index d069874..f074f23 100644 --- a/lustre/mdd/mdd_dir.c +++ b/lustre/mdd/mdd_dir.c @@ -784,6 +784,47 @@ out_put: return rc; } +int mdd_changelog_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *r, + struct llog_cookie *cookie, + int idx, struct thandle *th) +{ + int rc; + + if (r->lrh_type == CHANGELOG_REC) { + struct mdd_device *mdd; + struct llog_changelog_rec *rec; + + mdd = lu2mdd_dev(loghandle->lgh_ctxt->loc_obd->obd_lu_dev); + rec = container_of0(r, struct llog_changelog_rec, cr_hdr); + + spin_lock(&mdd->mdd_cl.mc_lock); + rec->cr.cr_index = mdd->mdd_cl.mc_index + 1; + spin_unlock(&mdd->mdd_cl.mc_lock); + + rc = llog_osd_ops.lop_write_rec(env, loghandle, r, + cookie, idx, th); + + /* + * if current llog is full, we will generate a new + * llog, and since it's actually not an error, let's + * avoid increasing index so that userspace apps + * should not see a gap in the changelog sequence + */ + if (!(rc == -ENOSPC && llog_is_full(loghandle))) { + spin_lock(&mdd->mdd_cl.mc_lock); + ++mdd->mdd_cl.mc_index; + spin_unlock(&mdd->mdd_cl.mc_lock); + } + } else { + rc = llog_osd_ops.lop_write_rec(env, loghandle, r, + cookie, idx, th); + } + + return rc; +} + /** Add a changelog entry \a rec to the changelog llog * \param mdd * \param rec @@ -806,13 +847,6 @@ int mdd_changelog_store(const struct lu_env *env, struct mdd_device *mdd, rec->cr_hdr.lrh_type = CHANGELOG_REC; rec->cr.cr_time = cl_time(); - spin_lock(&mdd->mdd_cl.mc_lock); - /* NB: I suppose it's possible llog_add adds out of order wrt cr_index, - * but as long as the MDD transactions are ordered correctly for e.g. - * rename conflicts, I don't think this should matter. */ - rec->cr.cr_index = ++mdd->mdd_cl.mc_index; - spin_unlock(&mdd->mdd_cl.mc_lock); - ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT); if (ctxt == NULL) return -ENXIO; @@ -821,6 +855,7 @@ int mdd_changelog_store(const struct lu_env *env, struct mdd_device *mdd, if (IS_ERR(llog_th)) GOTO(out_put, rc = PTR_ERR(llog_th)); + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_CHANGELOG_REORDER, cfs_fail_val); /* nested journal transaction */ rc = llog_add(env, ctxt->loc_handle, &rec->cr_hdr, NULL, llog_th); diff --git a/lustre/mdd/mdd_internal.h b/lustre/mdd/mdd_internal.h index 1d70d20..cc2e6aa 100644 --- a/lustre/mdd/mdd_internal.h +++ b/lustre/mdd/mdd_internal.h @@ -279,6 +279,12 @@ int mdd_dir_layout_shrink(const struct lu_env *env, struct md_object *md_obj, const struct lu_buf *lmu_buf); +int mdd_changelog_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *cookie, + int idx, struct thandle *th); + struct mdd_thread_info *mdd_env_info(const struct lu_env *env); #define MDD_ENV_VAR(env, var) (&mdd_env_info(env)->mti_##var) diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 80dcd28..46ae2a1 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -14133,6 +14133,34 @@ test_160j() { } run_test 160j "client can be umounted while its chanangelog is being used" +test_160k() { + [ $PARALLEL == "yes" ] && skip "skip parallel run" + remote_mds_nodsh && skip "remote MDS with nodsh" + + mkdir -p $DIR/$tdir/1/1 + + changelog_register || error "changelog_register failed" + local cl_user="${CL_USERS[$SINGLEMDS]%% *}" + + changelog_users $SINGLEMDS | grep -q $cl_user || + error "User '$cl_user' not found in changelog_users" +#define OBD_FAIL_MDS_CHANGELOG_REORDER 0x15d + do_facet mds1 $LCTL set_param fail_loc=0x8000015d fail_val=3 + rmdir $DIR/$tdir/1/1 & sleep 1 + mkdir $DIR/$tdir/2 + touch $DIR/$tdir/2/2 + rm -rf $DIR/$tdir/2 + + wait + sleep 4 + + changelog_dump | grep rmdir || error "rmdir not recorded" + + rm -rf $DIR/$tdir + changelog_deregister +} +run_test 160k "Verify that changelog records are not lost" + test_161a() { [ $PARALLEL == "yes" ] && skip "skip parallel run"