From f60b307c5001e1d9035af61d2344af33d3ea0f85 Mon Sep 17 00:00:00 2001 From: Mikhail Pershin Date: Fri, 24 Sep 2021 18:47:44 +0300 Subject: [PATCH] LU-14699 mdd: proactive changelog garbage collection Currently changelog starts garbage collection when a user exceeds the maximum idle timeout; there is also a limit on the number of idle records, but it is used only for old changelog users which have no cur_time field, therefore it is not used at all nowadays. Another problem is that garbage collection is started only when changelog is almost full. That often causes situations where changelog might have very old users staying much longer than idle timeout and having idle records above maximum limit consuming space for nothing. Patch reworks changelog GC in the following way: - GC starts when changelog is almost full (old way) or either idle time or idle records limits are exceeded or when (idle_time * idle_records) exceeds its limit as well. The last limit is calculated as: (idle_time * idle_records) / 86400 > (1 << 32) which is a reasonable heuristic for deciding if a user is "too idle" in both cases: when lots of records are being created quickly vs. when a user is idle for a very long time. - to avoid the processing of changelog users each time GC is checking all conditions both least user record and time are tracked when changelog users are initialized or purged/canceled. Both values are stored as mdd_changelog fields mc_minrec and mc_mintime - test 160g is changed to test the new approach when idle indexes are checked always along with idle time checks - test 160s is added in sanity.sh to check heuristic approach with (idle_time * idle_records) value checking Fixes: 3442db6faf68 ("LU-7340 mdd: changelogs garbage collection") Signed-off-by: Mikhail Pershin Change-Id: I6028f3164212a2377a4fc45b60a826c64f859099 Reviewed-on: https://review.whamcloud.com/45068 Reviewed-by: Andreas Dilger Reviewed-by: John L. 
Hammond Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/mdd/mdd_device.c | 180 ++++++++++++++++++++++--------------------- lustre/mdd/mdd_dir.c | 38 ++++++--- lustre/mdd/mdd_internal.h | 13 ++++ lustre/mdd/mdd_trans.c | 104 ++++++++----------------- lustre/tests/sanity.sh | 128 +++++++++++++++++++++++------- 6 files changed, 269 insertions(+), 195 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 723b331..5716784 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -246,6 +246,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_REINT_OPEN2 0x16a #define OBD_FAIL_MDS_COMMITRW_DELAY 0x16b #define OBD_FAIL_MDS_CHANGELOG_DEL 0x16c +#define OBD_FAIL_MDS_CHANGELOG_IDX_PUMP 0x16d /* layout lock */ #define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c index f24c208..7f65dc8 100644 --- a/lustre/mdd/mdd_device.c +++ b/lustre/mdd/mdd_device.c @@ -256,6 +256,8 @@ static int changelog_user_init_cb(const struct lu_env *env, mdd->mdd_cl.mc_current_mask |= rec->cur_mask; else if (mdd->mdd_cl.mc_proc_mask == CHANGELOG_MINMASK) mdd->mdd_cl.mc_current_mask |= CHANGELOG_DEFMASK; + mdd->mdd_cl.mc_mintime = min(mdd->mdd_cl.mc_mintime, rec->cur_time); + mdd->mdd_cl.mc_minrec = min(mdd->mdd_cl.mc_minrec, rec->cur_endrec); spin_unlock(&mdd->mdd_cl.mc_user_lock); spin_lock(&mdd->mdd_cl.mc_lock); if (rec->cur_endrec > mdd->mdd_cl.mc_index) @@ -652,6 +654,8 @@ static int mdd_changelog_init(const struct lu_env *env, struct mdd_device *mdd) /* ensure a GC check will, and a thread run may, occur upon start */ mdd->mdd_cl.mc_gc_time = 0; mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE; + mdd->mdd_cl.mc_mintime = (__u32)ktime_get_real_seconds(); + mdd->mdd_cl.mc_minrec = ULLONG_MAX; rc = mdd_changelog_llog_init(env, mdd); if (rc) { @@ -718,7 +722,8 @@ again: } } -/** Remove entries with indicies up to and including \a endrec 
from the +/** + * Remove entries with indicies up to and including \a endrec from the * changelog * \param mdd * \param endrec @@ -726,50 +731,48 @@ again: */ static int mdd_changelog_llog_cancel(const struct lu_env *env, struct mdd_device *mdd, - long long endrec) + unsigned long long endrec) { - struct obd_device *obd = mdd2obd_dev(mdd); - struct llog_ctxt *ctxt; - long long unsigned cur; + struct obd_device *obd = mdd2obd_dev(mdd); + struct llog_ctxt *ctxt; + unsigned long long cur; struct changelog_cancel_cookie cookie; - int rc; + int rc; - ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT); - if (ctxt == NULL) - return -ENXIO; + ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT); + if (!ctxt) + return -ENXIO; spin_lock(&mdd->mdd_cl.mc_lock); cur = (long long)mdd->mdd_cl.mc_index; spin_unlock(&mdd->mdd_cl.mc_lock); - if (endrec > cur) - endrec = cur; - - /* purge to "0" is shorthand for everything */ - if (endrec == 0) - endrec = cur; - - /* If purging all records, write a header entry so we don't have an - empty catalog and we're sure to have a valid starting index next - time. In case of crash, we just restart with old log so we're - allright. */ - if (endrec == cur) { - /* XXX: transaction is started by llog itself */ - rc = mdd_changelog_write_header(env, mdd, CLM_PURGE); - if (rc) - goto out; - } - /* Some records were purged, so reset repeat-access time (so we - record new mtime update records, so users can see a file has been - changed since the last purge) */ + /* + * If purging all records, write a header entry so we don't have an + * empty catalog and we're sure to have a valid starting index next + * time. In a case of crash, we just restart with old log so we're + * allright. 
+ */ + if (endrec >= cur) { + rc = mdd_changelog_write_header(env, mdd, CLM_PURGE); + if (rc) + goto out; + endrec = cur; + } + + /* + * Some records were purged, so reset repeat-access time (so we + * record new mtime update records, so users can see a file has been + * changed since the last purge) + */ mdd->mdd_cl.mc_starttime = ktime_get(); cookie.endrec = endrec; cookie.mdd = mdd; rc = llog_changelog_cancel(env, ctxt, &cookie); out: - llog_ctxt_put(ctxt); - return rc; + llog_ctxt_put(ctxt); + return rc; } /** Add a CL_MARK record to the changelog @@ -1768,8 +1771,12 @@ static int mdd_changelog_user_register(const struct lu_env *env, spin_unlock(&mdd->mdd_cl.mc_user_lock); rec->cur_time = (__u32)ktime_get_real_seconds(); - if (OBD_FAIL_CHECK(OBD_FAIL_TIME_IN_CHLOG_USER)) - rec->cur_time = 0; + if (OBD_FAIL_PRECHECK(OBD_FAIL_TIME_IN_CHLOG_USER)) { + rec->cur_time -= min(cfs_fail_val, rec->cur_time); + spin_lock(&mdd->mdd_cl.mc_user_lock); + mdd->mdd_cl.mc_mintime = rec->cur_time; + spin_unlock(&mdd->mdd_cl.mc_user_lock); + } spin_lock(&mdd->mdd_cl.mc_lock); rec->cur_endrec = mdd->mdd_cl.mc_index; @@ -1901,6 +1908,7 @@ struct mdd_changelog_user_purge { __u32 mcup_usercount; __u64 mcup_minrec; bool mcup_found; + char mcup_name[CHANGELOG_USER_NAMELEN_FULL]; }; /** @@ -1936,10 +1944,11 @@ static int mdd_changelog_user_purge_cb(const struct lu_env *env, RETURN(0); } + mdd_chlg_username(rec, mcup->mcup_name, sizeof(mcup->mcup_name)); + /* Unregister this user */ cookie.lgc_lgl = llh->lgh_id; cookie.lgc_index = hdr->lrh_index; - rc = llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie); if (rc == 0) { @@ -1962,6 +1971,7 @@ int mdd_changelog_user_purge(const struct lu_env *env, .mcup_found = false, .mcup_usercount = 0, .mcup_minrec = ULLONG_MAX, + .mcup_name = { 0 }, }; struct llog_ctxt *ctxt; int rc; @@ -1973,17 +1983,21 @@ int mdd_changelog_user_purge(const struct lu_env *env, ctxt = llog_get_context(mdd2obd_dev(mdd), LLOG_CHANGELOG_USER_ORIG_CTXT); - 
if (ctxt == NULL || - (ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) == 0) + if (!ctxt) + RETURN(-ENXIO); + if (!(ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) GOTO(out, rc = -ENXIO); rc = llog_cat_process(env, ctxt->loc_handle, - mdd_changelog_user_purge_cb, &mcup, - 0, 0); + mdd_changelog_user_purge_cb, &mcup, 0, 0); + if (rc) { + CWARN("%s: failed to purge changelog for user %s: rc = %d\n", + mdd2obd_dev(mdd)->obd_name, mcup.mcup_name, rc); + GOTO(out, rc); + } OBD_FAIL_TIMEOUT(OBD_FAIL_LLOG_PURGE_DELAY, cfs_fail_val); - - if ((rc == 0) && (mcup.mcup_usercount == 0)) { + if (mcup.mcup_usercount == 0) { spin_lock(&mdd->mdd_cl.mc_user_lock); if (mdd->mdd_cl.mc_users == 0) { /* No more users; turn changelogs off */ @@ -1993,18 +2007,14 @@ int mdd_changelog_user_purge(const struct lu_env *env, spin_unlock(&mdd->mdd_cl.mc_user_lock); } - if ((rc == 0) && mcup.mcup_found) { - CDEBUG(D_IOCTL, "%s: Purging changelog entries for user %d " - "record=%llu\n", - mdd2obd_dev(mdd)->obd_name, id, mcup.mcup_minrec); - /* Cancelling record 0 destroys the entire changelog, make sure - we don't do that unless we mean it. 
*/ - if (mcup.mcup_minrec != 0 || mcup.mcup_usercount == 0) { - rc = mdd_changelog_llog_cancel(env, mdd, - mcup.mcup_minrec); - } + if (mcup.mcup_found) { + CDEBUG(D_IOCTL, + "%s: Purge changelog entries for user %s record=%llu\n", + mdd2obd_dev(mdd)->obd_name, + mcup.mcup_name, mcup.mcup_minrec); + rc = mdd_changelog_llog_cancel(env, mdd, mcup.mcup_minrec); } else { - CWARN("%s: No changelog for user %u; rc=%d\n", + CWARN("%s: No changelog for user id %u: rc = %d\n", mdd2obd_dev(mdd)->obd_name, id, rc); GOTO(out, rc = -ENOENT); } @@ -2013,8 +2023,7 @@ int mdd_changelog_user_purge(const struct lu_env *env, EXIT; out: - if (ctxt != NULL) - llog_ctxt_put(ctxt); + llog_ctxt_put(ctxt); return rc; } @@ -2022,9 +2031,11 @@ out: struct mdd_changelog_user_clear { __u64 mcuc_endrec; __u64 mcuc_minrec; + __u32 mcuc_mintime; __u32 mcuc_id; bool mcuc_flush; struct mdd_device *mcuc_mdd; + char mcuc_name[CHANGELOG_USER_NAMELEN_FULL]; }; /** @@ -2043,7 +2054,6 @@ static int mdd_changelog_clear_cb(const struct lu_env *env, { struct llog_changelog_user_rec2 *rec; struct mdd_changelog_user_clear *mcuc = data; - char user_name[CHANGELOG_USER_NAMELEN_FULL]; struct mdd_device *mdd = mcuc->mcuc_mdd; int rc; @@ -2055,38 +2065,33 @@ static int mdd_changelog_clear_cb(const struct lu_env *env, rec = container_of(hdr, typeof(*rec), cur_hdr); /* Does the changelog id match the requested id? 
*/ if (rec->cur_id != mcuc->mcuc_id) { - mcuc->mcuc_minrec = min(mcuc->mcuc_minrec, - rec->cur_endrec); + mcuc->mcuc_minrec = min(mcuc->mcuc_minrec, rec->cur_endrec); + mcuc->mcuc_mintime = min(mcuc->mcuc_mintime, rec->cur_time); RETURN(0); } + mdd_chlg_username(rec, mcuc->mcuc_name, sizeof(mcuc->mcuc_name)); /* cur_endrec is the oldest purgeable record, make sure we're newer */ if (rec->cur_endrec > mcuc->mcuc_endrec) { rc = -EINVAL; CDEBUG(D_IOCTL, - "%s: request %llu > endrec %llu for user %s: rc = %d\n", - mdd2obd_dev(mdd)->obd_name, - mcuc->mcuc_endrec, rec->cur_endrec, - mdd_chlg_username(rec, user_name, sizeof(user_name)), - rc); + "%s: request %llu < endrec %llu for user %s: rc = %d\n", + mdd2obd_dev(mdd)->obd_name, mcuc->mcuc_endrec, + rec->cur_endrec, mcuc->mcuc_name, rc); RETURN(rc); } - /* Flag that we've met all the range and user checks. + /* + * Flag that we've met all the range and user checks. * We now know the record to flush. */ - rec->cur_endrec = mcuc->mcuc_endrec; + mcuc->mcuc_flush = true; + rec->cur_endrec = mcuc->mcuc_endrec; rec->cur_time = (__u32)ktime_get_real_seconds(); - if (OBD_FAIL_CHECK(OBD_FAIL_TIME_IN_CHLOG_USER)) - rec->cur_time = 0; - - mcuc->mcuc_flush = true; - CDEBUG(D_IOCTL, "%s: rewriting changelog user %s endrec = %llu\n", - mdd2obd_dev(mdd)->obd_name, - mdd_chlg_username(rec, user_name, sizeof(user_name)), - rec->cur_endrec); + CDEBUG(D_IOCTL, "%s: update changelog user %s endrec = %llu\n", + mdd2obd_dev(mdd)->obd_name, mcuc->mcuc_name, rec->cur_endrec); /* Update the endrec */ rc = llog_write(env, llh, hdr, hdr->lrh_index); @@ -2106,6 +2111,8 @@ static int mdd_changelog_clear(const struct lu_env *env, .mcuc_minrec = endrec, .mcuc_flush = false, .mcuc_mdd = mdd, + .mcuc_mintime = ktime_get_real_seconds(), + .mcuc_name = { 0 }, }; struct llog_ctxt *ctxt; __u64 start_rec; @@ -2139,27 +2146,26 @@ static int mdd_changelog_clear(const struct lu_env *env, (ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) == 0) GOTO(out, rc 
= -ENXIO); - rc = llog_cat_process(env, ctxt->loc_handle, - mdd_changelog_clear_cb, (void *)&mcuc, - 0, 0); - + rc = llog_cat_process(env, ctxt->loc_handle, mdd_changelog_clear_cb, + &mcuc, 0, 0); if (rc == -EINVAL) { CDEBUG(D_IOCTL, "%s: No changelog recnum <= %llu to clear\n", - mdd2obd_dev(mdd)->obd_name, (unsigned long long) endrec); + mdd2obd_dev(mdd)->obd_name, (unsigned long long)endrec); RETURN(-EINVAL); } else if (rc < 0) { - CWARN("%s: Failure to clear the changelog for user %d: %d\n", - mdd2obd_dev(mdd)->obd_name, id, rc); + CWARN("%s: can't clear the changelog for user %s: rc = %d\n", + mdd2obd_dev(mdd)->obd_name, mcuc.mcuc_name, rc); } else if (mcuc.mcuc_flush) { - /* Cancelling record 0 destroys the entire changelog, make sure - we don't do that unless we mean it. */ - if (mcuc.mcuc_minrec != 0) { - CDEBUG(D_IOCTL, "%s: Purging changelog entries up "\ - "to %llu\n", mdd2obd_dev(mdd)->obd_name, - mcuc.mcuc_minrec); - - rc = mdd_changelog_llog_cancel(env, mdd, - mcuc.mcuc_minrec); + CDEBUG(D_IOCTL, + "%s: purge changelog user %s entries up to %llu\n", + mdd2obd_dev(mdd)->obd_name, mcuc.mcuc_name, + mcuc.mcuc_minrec); + rc = mdd_changelog_llog_cancel(env, mdd, mcuc.mcuc_minrec); + if (!rc) { + spin_lock(&mdd->mdd_cl.mc_user_lock); + mdd->mdd_cl.mc_minrec = mcuc.mcuc_minrec; + mdd->mdd_cl.mc_mintime = mcuc.mcuc_mintime; + spin_unlock(&mdd->mdd_cl.mc_user_lock); } } else { CDEBUG(D_IOCTL, "%s: No entry for user %d\n", diff --git a/lustre/mdd/mdd_dir.c b/lustre/mdd/mdd_dir.c index d48dc83..07777ee 100644 --- a/lustre/mdd/mdd_dir.c +++ b/lustre/mdd/mdd_dir.c @@ -822,6 +822,17 @@ int mdd_changelog_write_rec(const struct lu_env *env, return rc; } +bool mdd_changelog_need_gc(const struct lu_env *env, struct mdd_device *mdd, + struct llog_handle *lgh) +{ + unsigned long free_cat_entries = llog_cat_free_space(lgh); + struct mdd_changelog *mc = &mdd->mdd_cl; + + return free_cat_entries <= mdd->mdd_changelog_min_free_cat_entries || + mdd_changelog_is_too_idle(mdd, 
mc->mc_minrec, mc->mc_mintime) || + OBD_FAIL_CHECK(OBD_FAIL_FORCE_GC_THREAD); +} + /** Add a changelog entry \a rec to the changelog llog * \param mdd * \param rec @@ -832,10 +843,11 @@ int mdd_changelog_write_rec(const struct lu_env *env, int mdd_changelog_store(const struct lu_env *env, struct mdd_device *mdd, struct llog_changelog_rec *rec, struct thandle *th) { - struct obd_device *obd = mdd2obd_dev(mdd); - struct llog_ctxt *ctxt; - struct thandle *llog_th; - int rc; + struct obd_device *obd = mdd2obd_dev(mdd); + struct llog_ctxt *ctxt; + struct thandle *llog_th; + int rc; + bool need_gc; rec->cr_hdr.lrh_len = llog_data_len(sizeof(*rec) + changelog_rec_varsize(&rec->cr)); @@ -863,20 +875,24 @@ int mdd_changelog_store(const struct lu_env *env, struct mdd_device *mdd, ktime_get_real_seconds() - mdd->mdd_cl.mc_gc_time)) /* save a spin_lock trip */ goto out_put; + + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_CHANGELOG_IDX_PUMP)) { + spin_lock(&mdd->mdd_cl.mc_lock); + mdd->mdd_cl.mc_index += cfs_fail_val; + spin_unlock(&mdd->mdd_cl.mc_lock); + } + + need_gc = mdd_changelog_need_gc(env, mdd, ctxt->loc_handle); spin_lock(&mdd->mdd_cl.mc_lock); if (likely(mdd->mdd_changelog_gc && mdd->mdd_cl.mc_gc_task == MDD_CHLG_GC_NONE && ktime_get_real_seconds() - mdd->mdd_cl.mc_gc_time > mdd->mdd_changelog_min_gc_interval)) { - if (unlikely(llog_cat_free_space(ctxt->loc_handle) <= - mdd->mdd_changelog_min_free_cat_entries || - OBD_FAIL_CHECK(OBD_FAIL_FORCE_GC_THREAD))) { - CWARN("%s:%s low on changelog_catalog free entries, " - "starting ChangeLog garbage collection thread\n", + if (unlikely(need_gc)) { + CWARN("%s:%s starting changelog garbage collection\n", obd->obd_name, OBD_FAIL_CHECK(OBD_FAIL_FORCE_GC_THREAD) ? 
- " simulate" : ""); - + " simulate" : ""); /* indicate further kthread run will occur outside * right after current journal transaction filling has * completed diff --git a/lustre/mdd/mdd_internal.h b/lustre/mdd/mdd_internal.h index 3ed6472..1986d27 100644 --- a/lustre/mdd/mdd_internal.h +++ b/lustre/mdd/mdd_internal.h @@ -97,6 +97,8 @@ struct mdd_changelog { int mc_flags; __u32 mc_proc_mask; /* per-server mask set via parameters */ __u32 mc_current_mask; /* combined global+users */ + __u32 mc_mintime; /* the oldest changelog user time */ + __u64 mc_minrec; /* last known minimal used index */ __u64 mc_index; ktime_t mc_starttime; spinlock_t mc_user_lock; @@ -851,4 +853,15 @@ static inline bool mdd_changelog_enabled(const struct lu_env *env, } } +static inline bool mdd_changelog_is_too_idle(struct mdd_device *mdd, + __u64 cl_rec, __u32 cl_time) +{ + __u64 idle_indexes = mdd->mdd_cl.mc_index - cl_rec; + __u32 idle_time = (__u32)ktime_get_real_seconds() - cl_time; + + return (idle_indexes > mdd->mdd_changelog_max_idle_indexes || + idle_time > mdd->mdd_changelog_max_idle_time || + idle_time * idle_indexes > (24 * 3600ULL << 32)); +} + #endif diff --git a/lustre/mdd/mdd_trans.c b/lustre/mdd/mdd_trans.c index 0520052..a6c5971 100644 --- a/lustre/mdd/mdd_trans.c +++ b/lustre/mdd/mdd_trans.c @@ -75,8 +75,8 @@ int mdd_trans_start(const struct lu_env *env, struct mdd_device *mdd, struct mdd_changelog_gc { struct mdd_device *mcgc_mdd; __u32 mcgc_id; - __u32 mcgc_maxtime; - __u64 mcgc_maxindexes; + __u32 mcgc_mintime; + __u64 mcgc_minrec; char mcgc_name[CHANGELOG_USER_NAMELEN_FULL]; }; @@ -97,45 +97,13 @@ static int mdd_changelog_gc_cb(const struct lu_env *env, rec = container_of(hdr, typeof(*rec), cur_hdr); - /* find oldest idle user, based on last record update/cancel time (new - * behavior), or for old user records, last record index vs current - * ChangeLog index. 
Late users with old record format will be treated - * first as we assume they could be idle since longer - */ - if (rec->cur_time != 0) { - u32 time_now = (u32)ktime_get_real_seconds(); - timeout_t time_out = rec->cur_time + - mdd->mdd_changelog_max_idle_time; - timeout_t idle_time = time_now - rec->cur_time; - - /* treat oldest idle user first, and if no old format user - * has been already selected - */ - if (time_after32(time_now, time_out) && - idle_time > mcgc->mcgc_maxtime && - mcgc->mcgc_maxindexes == 0) { - mcgc->mcgc_maxtime = idle_time; - mcgc->mcgc_id = rec->cur_id; - mdd_chlg_username(rec, mcgc->mcgc_name, - sizeof(mcgc->mcgc_name)); - } - } else { - /* old user record with no idle time stamp, so use empirical - * method based on its current index/position - */ - __u64 idle_indexes; - - idle_indexes = mdd->mdd_cl.mc_index - rec->cur_endrec; - - /* treat user with the oldest/smallest current index first */ - if (idle_indexes >= mdd->mdd_changelog_max_idle_indexes && - idle_indexes > mcgc->mcgc_maxindexes) { - mcgc->mcgc_maxindexes = idle_indexes; - mcgc->mcgc_id = rec->cur_id; - mdd_chlg_username(rec, mcgc->mcgc_name, - sizeof(mcgc->mcgc_name)); - } - + if (mdd_changelog_is_too_idle(mdd, rec->cur_endrec, rec->cur_time) && + rec->cur_endrec < mcgc->mcgc_minrec) { + mcgc->mcgc_mintime = rec->cur_time; + mcgc->mcgc_minrec = rec->cur_endrec; + mcgc->mcgc_id = rec->cur_id; + mdd_chlg_username(rec, mcgc->mcgc_name, + sizeof(mcgc->mcgc_name)); } RETURN(0); } @@ -156,58 +124,54 @@ static int mdd_chlg_garbage_collect(void *data) mdd2obd_dev(mdd)->obd_name, current->pid); OBD_ALLOC_PTR(env); - if (env == NULL) + if (!env) GOTO(out, rc = -ENOMEM); rc = lu_env_init(env, LCT_MD_THREAD); if (rc) - GOTO(out, rc); + GOTO(out_free, rc); + + ctxt = llog_get_context(mdd2obd_dev(mdd), + LLOG_CHANGELOG_USER_ORIG_CTXT); + if (!ctxt) + GOTO(out_env, rc = -ENXIO); + if (!(ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) + GOTO(out_ctxt, rc = -ENXIO); for (;;) { + __u32 
time_now = (__u32)ktime_get_real_seconds(); struct mdd_changelog_gc mcgc = { .mcgc_mdd = mdd, - .mcgc_maxtime = 0, - .mcgc_maxindexes = 0, + .mcgc_minrec = mdd->mdd_cl.mc_index, + .mcgc_name = { 0 }, }; - ctxt = llog_get_context(mdd2obd_dev(mdd), - LLOG_CHANGELOG_USER_ORIG_CTXT); - if (ctxt == NULL || - (ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) == 0) - GOTO(out_ctxt, rc = -ENXIO); - rc = llog_cat_process(env, ctxt->loc_handle, mdd_changelog_gc_cb, &mcgc, 0, 0); - if (rc != 0 || !mcgc.mcgc_name[0]) + if (rc) + GOTO(out_ctxt, rc); + + if (!mcgc.mcgc_name[0]) break; - llog_ctxt_put(ctxt); - if (mcgc.mcgc_maxindexes != 0) - CWARN("%s: Force deregister of ChangeLog user %s idle with more than %llu unprocessed records\n", - mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_name, - mcgc.mcgc_maxindexes); - else - CWARN("%s: Force deregister of ChangeLog user %s idle since more than %us\n", - mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_name, - mcgc.mcgc_maxtime); + CWARN("%s: force deregister of changelog user %s idle for %us with %llu unprocessed records\n", + mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_name, + time_now - mcgc.mcgc_mintime, + mdd->mdd_cl.mc_index - mcgc.mcgc_minrec); mdd_changelog_user_purge(env, mdd, mcgc.mcgc_id); if (kthread_should_stop()) - GOTO(out_env, rc = 0); + GOTO(out_ctxt, rc = 0); } - + EXIT; out_ctxt: - if (ctxt != NULL) - llog_ctxt_put(ctxt); - + llog_ctxt_put(ctxt); out_env: lu_env_fini(env); - GOTO(out, rc); +out_free: + OBD_FREE_PTR(env); out: - if (env) - OBD_FREE_PTR(env); - spin_lock(&mdd->mdd_cl.mc_lock); mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE; spin_unlock(&mdd->mdd_cl.mc_lock); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index e6396f3..9edc416 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -16050,14 +16050,11 @@ run_test 160f "changelog garbage collect (timestamped users)" test_160g() { remote_mds_nodsh && skip "remote MDS with nodsh" - [[ $MDS1_VERSION -ge $(version_code 2.10.56) ]] || - skip "Need MDS 
version at least 2.10.56" + [[ $MDS1_VERSION -ge $(version_code 2.14.55) ]] || + skip "Need MDS version at least 2.14.55" local mdts=$(comma_list $(mdts_nodes)) - #define OBD_FAIL_TIME_IN_CHLOG_USER 0x1314 - do_nodes $mdts $LCTL set_param fail_loc=0x1314 - # Create a user changelog_register || error "first changelog_register failed" changelog_register || error "second changelog_register failed" @@ -16082,10 +16079,9 @@ test_160g() { (( $nbcl > 0 )) || error "no changelogs found" # reduce the max_idle_indexes value to make sure we exceed it - for param in "changelog_max_idle_indexes=1" \ + for param in "changelog_max_idle_indexes=2" \ "changelog_gc=1" \ - "changelog_min_gc_interval=2" \ - "changelog_min_free_cat_entries=3"; do + "changelog_min_gc_interval=2"; do local MDT0=$(facet_svc $SINGLEMDS) local var="${param%=*}" local old=$(do_facet mds1 "$LCTL get_param -n mdd.$MDT0.$var") @@ -16095,10 +16091,6 @@ test_160g() { error "unable to set mdd.*.$param" done - # simulate changelog catalog almost full - #define OBD_FAIL_CAT_FREE_RECORDS 0x1313 - do_nodes $mdts "$LCTL set_param fail_loc=0x1313 fail_val=3" - local start=$SECONDS for i in $(seq $MDSCOUNT); do cl_users=(${CL_USERS[mds$i]}) @@ -16106,38 +16098,37 @@ test_160g() { cl_user2[mds$i]="${cl_users[1]}" [ -n "${cl_user1[mds$i]}" ] || - error "mds$i: no user registered" + error "mds$i: user1 is not registered" [ -n "${cl_user2[mds$i]}" ] || error "mds$i: only ${cl_user1[mds$i]} is registered" user_rec1=$(changelog_user_rec mds$i ${cl_user1[mds$i]}) [ -n "$user_rec1" ] || - error "mds$i: User ${cl_user1[mds$i]} not registered" + error "mds$i: user1 ${cl_user1[mds$i]} not found" __changelog_clear mds$i ${cl_user1[mds$i]} +2 user_rec2=$(changelog_user_rec mds$i ${cl_user1[mds$i]}) [ -n "$user_rec2" ] || - error "mds$i: User ${cl_user1[mds$i]} not registered" - echo "mds$i: verifying user ${cl_user1[mds$i]} clear: " \ + error "mds$i: user1 ${cl_user1[mds$i]} not found (2)" + echo "mds$i: verifying user1 
${cl_user1[mds$i]} clear: " \ "$user_rec1 + 2 == $user_rec2" [ $((user_rec1 + 2)) == $user_rec2 ] || - error "mds$i: user ${cl_user1[mds$i]} index expected " \ - "$user_rec1 + 2, but is $user_rec2" + error "mds$i: user1 ${cl_user1[mds$i]} index " \ + "expected $user_rec1 + 2, but is $user_rec2" user_rec2=$(changelog_user_rec mds$i ${cl_user2[mds$i]}) [ -n "$user_rec2" ] || - error "mds$i: User ${cl_user2[mds$i]} not registered" + error "mds$i: user2 ${cl_user2[mds$i]} not found" [ $user_rec1 == $user_rec2 ] || - error "mds$i: user ${cl_user2[mds$i]} index expected " \ - "$user_rec1, but is $user_rec2" + error "mds$i: user2 ${cl_user2[mds$i]} index " \ + "expected $user_rec1, but is $user_rec2" done # ensure we are past the previous changelog_min_gc_interval set above local sleep2=$((start + 2 - SECONDS)) (( sleep2 > 0 )) && echo "sleep $sleep2 for interval" && sleep $sleep2 - # Generate one more changelog to trigger GC at fail_loc for cl_user2. # cl_user1 should be OK because it recently processed records. 
for ((i = 0; i < MDSCOUNT; i++)); do - $LFS mkdir -i $i $DIR/$tdir/d$i.3 $DIR/$tdir/d$i.4 || + $LFS mkdir -i $i $DIR/$tdir/d$i.3 || error "create $DIR/$tdir/d$i.3 failed" done @@ -16151,15 +16142,15 @@ test_160g() { for (( i = 1; i <= MDSCOUNT; i++ )); do # check cl_user1 still registered changelog_users mds$i | grep -q "${cl_user1[mds$i]}" || - error "mds$i: User ${cl_user1[mds$i]} not registered" + error "mds$i: user1 ${cl_user1[mds$i]} not found (3)" # check cl_user2 unregistered changelog_users mds$i | grep -q "${cl_user2[mds$i]}" && - error "mds$i: User ${cl_user2[mds$i]} still registered" + error "mds$i: user2 ${cl_user2[mds$i]} is registered" # check changelogs are present and starting at $user_rec1 + 1 user_rec1=$(changelog_user_rec mds$i ${cl_user1[mds$i]}) [ -n "$user_rec1" ] || - error "mds$i: User ${cl_user1[mds$i]} not registered" + error "mds$i: user1 ${cl_user1[mds$i]} not found (4)" first_rec=$($LFS changelog $(facet_svc mds$i) | awk '{ print $1; exit; }') @@ -16168,7 +16159,7 @@ test_160g() { error "mds$i: rec $first_rec != $user_rec1 + 1" done } -run_test 160g "changelog garbage collect (old users)" +run_test 160g "changelog garbage collect on idle records" test_160h() { remote_mds_nodsh && skip "remote MDS with nodsh" && return @@ -16744,6 +16735,89 @@ test_160q() { } run_test 160q "changelog effective mask is DEFMASK if not set" +test_160s() { + remote_mds_nodsh && skip "remote MDS with nodsh" + (( $MDS1_VERSION >= $(version_code 2.14.55) )) || + skip "Need MDS version at least 2.14.55" + + local mdts=$(comma_list $(mdts_nodes)) + + #define OBD_FAIL_TIME_IN_CHLOG_USER 0x1314 + do_nodes $mdts $LCTL set_param fail_loc=0x1314 \ + fail_val=$((24 * 3600 * 10)) + + # Create a user which is 10 days old + changelog_register || error "first changelog_register failed" + local cl_users + declare -A cl_user1 + local i + + # generate some changelog records to accumulate on each MDT + # use all_char because created files should be evenly distributed + 
test_mkdir -c $MDSCOUNT -H all_char $DIR/$tdir || + error "test_mkdir $tdir failed" + for ((i = 0; i < MDSCOUNT; i++)); do + $LFS mkdir -i $i $DIR/$tdir/d$i.1 $DIR/$tdir/d$i.2 || + error "create $DIR/$tdir/d$i.1 failed" + done + + # check changelogs have been generated + local nbcl=$(changelog_dump | wc -l) + (( nbcl > 0 )) || error "no changelogs found" + + # reduce the max_idle_indexes value to make sure we exceed it + for param in "changelog_max_idle_indexes=2097446912" \ + "changelog_max_idle_time=2592000" \ + "changelog_gc=1" \ + "changelog_min_gc_interval=2"; do + local MDT0=$(facet_svc $SINGLEMDS) + local var="${param%=*}" + local old=$(do_facet mds1 "$LCTL get_param -n mdd.$MDT0.$var") + + stack_trap "do_nodes $mdts $LCTL set_param mdd.*.$var=$old" EXIT + do_nodes $mdts $LCTL set_param mdd.*.$param || + error "unable to set mdd.*.$param" + done + + local start=$SECONDS + for i in $(seq $MDSCOUNT); do + cl_users=(${CL_USERS[mds$i]}) + cl_user1[mds$i]="${cl_users[0]}" + + [[ -n "${cl_user1[mds$i]}" ]] || + error "mds$i: no user registered" + done + + #define OBD_FAIL_MDS_CHANGELOG_IDX_PUMP 0x16d + do_nodes $mdts $LCTL set_param fail_loc=0x16d fail_val=500000000 + + # ensure we are past the previous changelog_min_gc_interval set above + local sleep2=$((start + 2 - SECONDS)) + (( sleep2 > 0 )) && echo "sleep $sleep2 for interval" && sleep $sleep2 + + # Generate one more changelog to trigger GC + for ((i = 0; i < MDSCOUNT; i++)); do + $LFS mkdir -i $i $DIR/$tdir/d$i.3 $DIR/$tdir/d$i.4 || + error "create $DIR/$tdir/d$i.3 failed" + done + + # ensure gc thread is done + for node in $(mdts_nodes); do + wait_update $node "pgrep chlg_gc_thread" "" 20 || + error "$node: GC-thread not done" + done + + do_nodes $mdts $LCTL set_param fail_loc=0 + + for (( i = 1; i <= MDSCOUNT; i++ )); do + # check cl_user1 is purged + changelog_users mds$i | grep -q "${cl_user1[mds$i]}" && + error "mds$i: User ${cl_user1[mds$i]} is registered" + done + return 0 +} +run_test 160s 
"changelog garbage collect on idle records * time" + test_161a() { [ $PARALLEL == "yes" ] && skip "skip parallel run" -- 1.8.3.1