From bec1334954a73ed668fad409e8c728f9dfd6bb99 Mon Sep 17 00:00:00 2001 From: Mikhail Pershin Date: Mon, 7 Feb 2022 13:12:29 +0300 Subject: [PATCH] LU-15524 mdd: trigger changelog GC by free space If the amount of space consumed by the changelog becomes comparable with the filesystem free space, then start an emergency GC for the changelog by purging the oldest user. Such behavior is enabled by default and can be disabled via the mdd.*.changelog_free_space_gc parameter. Test 160t is added to sanity.sh. Signed-off-by: Mikhail Pershin Change-Id: Ia63cc71e708b0f10cdf54f45f0809c0e86950101 Reviewed-on: https://review.whamcloud.com/46467 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Emoly Liu Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/mdd/mdd_device.c | 4 ++ lustre/mdd/mdd_dir.c | 110 ++++++++++++++++++++++++++++++++++++++++++- lustre/mdd/mdd_internal.h | 9 ++++ lustre/mdd/mdd_lproc.c | 29 ++++++++++++ lustre/mdd/mdd_trans.c | 9 +++- lustre/tests/sanity.sh | 76 ++++++++++++++++++++++++++++++ 7 files changed, 234 insertions(+), 4 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 1da52a7..41cf729 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -269,6 +269,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_STRIPE_FID 0x189 #define OBD_FAIL_MDS_LINK_RENAME_RACE 0x18a #define OBD_FAIL_MDS_HSM_RESTORE_RACE 0x18b +#define OBD_FAIL_MDS_CHANGELOG_ENOSPC 0x18c /* OI scrub */ #define OBD_FAIL_OSD_SCRUB_DELAY 0x190 diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c index 94fb1f6..99967a0 100644 --- a/lustre/mdd/mdd_device.c +++ b/lustre/mdd/mdd_device.c @@ -149,6 +149,10 @@ static int mdd_init0(const struct lu_env *env, struct mdd_device *mdd, mdd->mdd_sync_permission = 1; /* enable changelog garbage collection */ mdd->mdd_changelog_gc = 1; + /* enable changelog cleanup due to lack of space */ + mdd->mdd_changelog_free_space_gc = true; + /* set when emergency GC is
started */ + mdd->mdd_changelog_emrg_gc = false; /* with a significant amount of idle time */ mdd->mdd_changelog_max_idle_time = CHLOG_MAX_IDLE_TIME; /* or a significant amount of late indexes */ diff --git a/lustre/mdd/mdd_dir.c b/lustre/mdd/mdd_dir.c index 1b83614..3d1e7a7 100644 --- a/lustre/mdd/mdd_dir.c +++ b/lustre/mdd/mdd_dir.c @@ -821,13 +821,119 @@ int mdd_changelog_write_rec(const struct lu_env *env, return rc; } +/** + * Checks that the changelog consumes a safe amount of space compared + * with FS free space + * + * \param env - current lu_env + * \param mdd - current MDD device + * \param lgh - changelog catalog llog handle + * \param estimate - true to estimate the llog size, false to get the exact size + * + * \retval false if the changelog exceeds the space limit, true otherwise + */ +bool mdd_changelog_is_space_safe(const struct lu_env *env, + struct mdd_device *mdd, + struct llog_handle *lgh, + bool estimate) +{ + struct obd_statfs sfs; + unsigned long long free_space_limit; + unsigned long long llog_size; + int rc; + + rc = dt_statfs(env, mdd->mdd_bottom, &sfs); + if (rc) + /* check is ignored if OSD is not healthy for any reason */ + return true; + + /* + * if changelog consumes more than 1/4 of available space then start + * emergency cleanup. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CHANGELOG_ENOSPC)) + free_space_limit = cfs_fail_val; + else + free_space_limit = (sfs.os_bfree * sfs.os_bsize) >> 2; + + /* if \estimate parameter is used then calculate llog size from + * number of used catalog entries and plain llog maximum size. + * Plain llog maximum size is set as 1/64 of FS free space limited + * by 128MB as maximum and 2MB as minimum, see llog_cat_new_log() + * Estimation helps to avoid full llog processing to get exact size + * by llog_cat_size().
+ */ + if (estimate) { + /* use 1/64 of FS size but keep it between 2MB and 128MB */ + llog_size = clamp_t(unsigned long long, + (sfs.os_blocks * sfs.os_bsize) >> 6, + 2 << 20, 128 << 20); + /* + * llog_cat_free_space() gives free slots, we need occupied, + * so subtract free from total slots minus one for header + */ + llog_size *= LLOG_HDR_BITMAP_SIZE(lgh->lgh_hdr) - 1 - + llog_cat_free_space(lgh); + } else { + /* get exact llog size */ + llog_size = llog_cat_size(env, lgh); + } + CDEBUG(D_HA, "%s:%s changelog size is %lluMB, space limit is %lluMB\n", + mdd2obd_dev(mdd)->obd_name, estimate ? " estimated" : "", + llog_size >> 20, free_space_limit >> 20); + + if (llog_size > free_space_limit) { + CWARN("%s: changelog uses %lluMB with %lluMB space limit\n", + mdd2obd_dev(mdd)->obd_name, llog_size >> 20, + free_space_limit >> 20); + return false; + } + + return true; +} + +/** + * Checks if there is enough space in changelog itself and in FS and forces + * emergency changelog cleanup if needed. It will purge users one by one + * from the oldest one while emergency conditions are true.
+ * + * \param env - current lu_env + * \param mdd - current MDD device + * \param lgh - changelog catalog llog handle + * + * \retval true if emergency cleanup is needed for changelog + */ +static bool mdd_changelog_emrg_cleanup(const struct lu_env *env, + struct mdd_device *mdd, + struct llog_handle *lgh) +{ + unsigned long free_entries = llog_cat_free_space(lgh); + + /* free space GC is disabled or is in progress already */ + if (!mdd->mdd_changelog_free_space_gc || mdd->mdd_changelog_emrg_gc) + return false; + + if (free_entries <= mdd->mdd_changelog_min_free_cat_entries) { + CWARN("%s: changelog has only %lu free catalog entries\n", + mdd2obd_dev(mdd)->obd_name, free_entries); + mdd->mdd_changelog_emrg_gc = true; + return true; + } + + if (!mdd_changelog_is_space_safe(env, mdd, lgh, true)) { + mdd->mdd_changelog_emrg_gc = true; + return true; + } + + return false; +} + bool mdd_changelog_need_gc(const struct lu_env *env, struct mdd_device *mdd, struct llog_handle *lgh) { - unsigned long free_cat_entries = llog_cat_free_space(lgh); struct mdd_changelog *mc = &mdd->mdd_cl; - return free_cat_entries <= mdd->mdd_changelog_min_free_cat_entries || + return mdd_changelog_emrg_cleanup(env, mdd, lgh) || mdd_changelog_is_too_idle(mdd, mc->mc_minrec, mc->mc_mintime) || OBD_FAIL_CHECK(OBD_FAIL_FORCE_GC_THREAD); } diff --git a/lustre/mdd/mdd_internal.h b/lustre/mdd/mdd_internal.h index a8c9e40..5918f8c 100644 --- a/lustre/mdd/mdd_internal.h +++ b/lustre/mdd/mdd_internal.h @@ -144,6 +144,10 @@ struct mdd_device { struct dt_object *mdd_orphans; /* PENDING directory */ struct mdd_changelog mdd_cl; unsigned int mdd_changelog_gc; + /* emergency GC is in progress */ + bool mdd_changelog_emrg_gc; + /* GC by free space is enabled */ + bool mdd_changelog_free_space_gc; time64_t mdd_changelog_max_idle_time; unsigned long mdd_changelog_max_idle_indexes; time64_t mdd_changelog_min_gc_interval; @@ -872,4 +876,9 @@ static inline bool mdd_changelog_is_too_idle(struct mdd_device *mdd, idle_time
* idle_indexes > (24 * 3600ULL << 32)); } +bool mdd_changelog_is_space_safe(const struct lu_env *env, + struct mdd_device *mdd, + struct llog_handle *lgh, + bool estimate); + #endif diff --git a/lustre/mdd/mdd_lproc.c b/lustre/mdd/mdd_lproc.c index e207487..cbae6c2 100644 --- a/lustre/mdd/mdd_lproc.c +++ b/lustre/mdd/mdd_lproc.c @@ -337,6 +337,34 @@ static ssize_t changelog_gc_store(struct kobject *kobj, } LUSTRE_RW_ATTR(changelog_gc); +static ssize_t changelog_free_space_gc_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct mdd_device *mdd = container_of(kobj, struct mdd_device, + mdd_kobj); + + return sprintf(buf, "%u\n", mdd->mdd_changelog_free_space_gc); +} + +static ssize_t changelog_free_space_gc_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct mdd_device *mdd = container_of(kobj, struct mdd_device, + mdd_kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + mdd->mdd_changelog_free_space_gc = val; + + return count; +} +LUSTRE_RW_ATTR(changelog_free_space_gc); + static ssize_t changelog_max_idle_time_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -737,6 +765,7 @@ static struct attribute *mdd_attrs[] = { &lustre_attr_atime_diff.attr, &lustre_attr_changelog_size.attr, &lustre_attr_changelog_gc.attr, + &lustre_attr_changelog_free_space_gc.attr, &lustre_attr_changelog_max_idle_time.attr, &lustre_attr_changelog_max_idle_indexes.attr, &lustre_attr_changelog_min_gc_interval.attr, diff --git a/lustre/mdd/mdd_trans.c b/lustre/mdd/mdd_trans.c index 6d24c3c..c2581ad 100644 --- a/lustre/mdd/mdd_trans.c +++ b/lustre/mdd/mdd_trans.c @@ -97,8 +97,9 @@ static int mdd_changelog_gc_cb(const struct lu_env *env, rec = container_of(hdr, typeof(*rec), cur_hdr); - if (mdd_changelog_is_too_idle(mdd, rec->cur_endrec, rec->cur_time) && - rec->cur_endrec < mcgc->mcgc_minrec) { + if (rec->cur_endrec < mcgc->mcgc_minrec && + (mdd->mdd_changelog_emrg_gc || + 
mdd_changelog_is_too_idle(mdd, rec->cur_endrec, rec->cur_time))) { mcgc->mcgc_mintime = rec->cur_time; mcgc->mcgc_minrec = rec->cur_endrec; mcgc->mcgc_id = rec->cur_id; @@ -161,6 +162,10 @@ static int mdd_chlg_garbage_collect(void *data) mdd_changelog_user_purge(env, mdd, mcgc.mcgc_id); + if (mdd->mdd_changelog_emrg_gc && + mdd_changelog_is_space_safe(env, mdd, ctxt->loc_handle, 0)) + mdd->mdd_changelog_emrg_gc = false; + if (kthread_should_stop()) GOTO(out_ctxt, rc = 0); } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index ed1da80..978f3c1 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -17186,6 +17186,82 @@ test_160s() { } run_test 160s "changelog garbage collect on idle records * time" +test_160t() { + remote_mds_nodsh && skip "remote MDS with nodsh" + (( $MDS1_VERSION >= $(version_code 2.15.50) )) || + skip "Need MDS version at least 2.15.50" + + local MDT0=$(facet_svc $SINGLEMDS) + local cl_users + local cl_user1 + local cl_user2 + local start + + changelog_register --user user1 -m all || + error "user1 failed to register" + + mkdir_on_mdt0 $DIR/$tdir + # create default overstripe to maximize changelog size + $LFS setstripe -C 8 $DIR/$tdir || error "setstripe failed" + createmany -o $DIR/$tdir/u1_ 2000 || error "createmany for user1 failed" + llog_size1=$(do_facet mds1 $LCTL get_param -n mdd.$MDT0.changelog_size) + + # user2 consumes less records so less space + changelog_register --user user2 || error "user2 failed to register" + createmany -o $DIR/$tdir/u2_ 500 || error "createmany for user2 failed" + llog_size2=$(do_facet mds1 $LCTL get_param -n mdd.$MDT0.changelog_size) + + # check changelogs have been generated + local nbcl=$(changelog_dump | wc -l) + (( nbcl > 0 )) || error "no changelogs found" + + # reduce the changelog_min_gc_interval to force check + for param in "changelog_gc=1" "changelog_min_gc_interval=2"; do + local var="${param%=*}" + local old=$(do_facet mds1 "$LCTL get_param -n mdd.$MDT0.$var") + + 
stack_trap "do_facet mds1 $LCTL set_param mdd.$MDT0.$var=$old" + do_facet mds1 $LCTL set_param mdd.$MDT0.$param || + error "unable to set mdd.*.$param" + done + + start=$SECONDS + cl_users=(${CL_USERS[mds1]}) + cl_user1="${cl_users[0]}" + cl_user2="${cl_users[1]}" + + [[ -n $cl_user1 ]] || + error "mds1: user #1 isn't registered" + [[ -n $cl_user2 ]] || + error "mds1: user #2 isn't registered" + + # ensure we are past the previous changelog_min_gc_interval set above + local sleep2=$((start + 2 - SECONDS)) + (( sleep2 > 0 )) && echo "sleep $sleep2 for interval" && sleep $sleep2 + + #define OBD_FAIL_MDS_CHANGELOG_ENOSPC 0x018c + do_facet mds1 $LCTL set_param fail_loc=0x018c \ + fail_val=$(((llog_size1 + llog_size2) / 2)) + + # Generate more changelog to trigger GC + createmany -o $DIR/$tdir/u3_ 4 || + error "create failed for more files" + + # ensure gc thread is done + wait_update_facet mds1 "pgrep chlg_gc_thread" "" 20 || + error "mds1: GC-thread not done" + + do_facet mds1 $LCTL set_param fail_loc=0 + + # check cl_user1 is purged + changelog_users mds1 | grep -q "$cl_user1" && + error "User $cl_user1 is registered" + # check cl_user2 is not purged + changelog_users mds1 | grep -q "$cl_user2" || + error "User $cl_user2 is not registered" +} +run_test 160t "changelog garbage collect on lack of space" + test_161a() { [ $PARALLEL == "yes" ] && skip "skip parallel run" -- 1.8.3.1