#define OBD_FAIL_MDS_STRIPE_FID 0x189
#define OBD_FAIL_MDS_LINK_RENAME_RACE 0x18a
#define OBD_FAIL_MDS_HSM_RESTORE_RACE 0x18b
+#define OBD_FAIL_MDS_CHANGELOG_ENOSPC 0x18c
/* OI scrub */
#define OBD_FAIL_OSD_SCRUB_DELAY 0x190
mdd->mdd_sync_permission = 1;
/* enable changelog garbage collection */
mdd->mdd_changelog_gc = 1;
+ /* enable changelog cleanup due to lack of space */
+ mdd->mdd_changelog_free_space_gc = true;
+ /* set when emergency GC is started */
+ mdd->mdd_changelog_emrg_gc = false;
/* with a significant amount of idle time */
mdd->mdd_changelog_max_idle_time = CHLOG_MAX_IDLE_TIME;
/* or a significant amount of late indexes */
return rc;
}
+/**
+ * Check whether the changelog occupies a safe amount of space compared
+ * with the free space reported by dt_statfs() for the bottom device.
+ *
+ * \param env - current lu_env
+ * \param mdd - current MDD device
+ * \param lgh - changelog catalog llog handle
+ * \param estimate - true to estimate the llog size from the number of used
+ *                   catalog entries, false to get the exact size with
+ *                   llog_cat_size()
+ *
+ * \retval true if the changelog size does not exceed the space limit, or
+ *         if dt_statfs() failed and the check was skipped
+ * \retval false if the changelog size exceeds the space limit
+ */
+bool mdd_changelog_is_space_safe(const struct lu_env *env,
+ struct mdd_device *mdd,
+ struct llog_handle *lgh,
+ bool estimate)
+{
+ struct obd_statfs sfs;
+ unsigned long long free_space_limit;
+ unsigned long long llog_size;
+ int rc;
+
+ rc = dt_statfs(env, mdd->mdd_bottom, &sfs);
+ if (rc)
+ /* check is ignored if OSD is not healthy for any reason */
+ return true;
+
+ /*
+ * if changelog consumes more than 1/4 of free space (os_bfree) then
+ * start emergency cleanup. With OBD_FAIL_MDS_CHANGELOG_ENOSPC set the
+ * limit is taken from cfs_fail_val instead.
+ */
+ if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CHANGELOG_ENOSPC))
+ free_space_limit = cfs_fail_val;
+ else
+ free_space_limit = (sfs.os_bfree * sfs.os_bsize) >> 2;
+
+ /* if \estimate parameter is used then calculate llog size from
+ * number of used catalog entries and plain llog maximum size.
+ * Plain llog maximum size is set as 1/64 of FS free space limited
+ * by 128MB as maximum and 2MB as minimum, see llog_cat_new_log()
+ * Estimation helps to avoid full llog processing to get exact size
+ * by llog_cat_size().
+ * NOTE(review): the estimate below is computed from os_blocks, not
+ * os_bfree - confirm which one llog_cat_new_log() really uses.
+ */
+ if (estimate) {
+ /* use 1/64 of FS size but keep it between 2MB and 128MB */
+ llog_size = clamp_t(unsigned long long,
+ (sfs.os_blocks * sfs.os_bsize) >> 6,
+ 2 << 20, 128 << 20);
+ /*
+ * llog_cat_free_space() gives free slots, we need occupied,
+ * so subtract free from total slots minus one for header
+ */
+ llog_size *= LLOG_HDR_BITMAP_SIZE(lgh->lgh_hdr) - 1 -
+ llog_cat_free_space(lgh);
+ } else {
+ /* get exact llog size */
+ llog_size = llog_cat_size(env, lgh);
+ }
+ CDEBUG(D_HA, "%s:%s changelog size is %lluMB, space limit is %lluMB\n",
+ mdd2obd_dev(mdd)->obd_name, estimate ? " estimated" : "",
+ llog_size >> 20, free_space_limit >> 20);
+
+ if (llog_size > free_space_limit) {
+ CWARN("%s: changelog uses %lluMB with %lluMB space limit\n",
+ mdd2obd_dev(mdd)->obd_name, llog_size >> 20,
+ free_space_limit >> 20);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * Decide whether an emergency changelog cleanup has to be started.
+ *
+ * Emergency cleanup is requested when the changelog catalog is running out
+ * of free entries, or when mdd_changelog_is_space_safe() reports that the
+ * estimated changelog size is over the space limit. In both cases
+ * mdd_changelog_emrg_gc is set to mark the emergency GC as started.
+ *
+ * \param env - current lu_env
+ * \param mdd - current MDD device
+ * \param lgh - changelog catalog llog handle
+ *
+ * \retval true if emergency cleanup has to be started
+ * \retval false if free space GC is disabled, is already in progress or
+ *         is not needed
+ */
+static bool mdd_changelog_emrg_cleanup(const struct lu_env *env,
+				       struct mdd_device *mdd,
+				       struct llog_handle *lgh)
+{
+	unsigned long free_slots = llog_cat_free_space(lgh);
+	bool start_gc;
+
+	/* nothing to do when free space GC is switched off */
+	if (!mdd->mdd_changelog_free_space_gc)
+		return false;
+
+	/* an emergency GC is running already, do not start another one */
+	if (mdd->mdd_changelog_emrg_gc)
+		return false;
+
+	if (free_slots <= mdd->mdd_changelog_min_free_cat_entries) {
+		CWARN("%s: changelog has only %lu free catalog entries\n",
+		      mdd2obd_dev(mdd)->obd_name, free_slots);
+		start_gc = true;
+	} else {
+		/* catalog has room, check the estimated on-disk size */
+		start_gc = !mdd_changelog_is_space_safe(env, mdd, lgh, true);
+	}
+
+	if (start_gc)
+		mdd->mdd_changelog_emrg_gc = true;
+
+	return start_gc;
+}
+
bool mdd_changelog_need_gc(const struct lu_env *env, struct mdd_device *mdd,
struct llog_handle *lgh)
{
- unsigned long free_cat_entries = llog_cat_free_space(lgh);
struct mdd_changelog *mc = &mdd->mdd_cl;
- return free_cat_entries <= mdd->mdd_changelog_min_free_cat_entries ||
+ return mdd_changelog_emrg_cleanup(env, mdd, lgh) || /* may set mdd_changelog_emrg_gc */
mdd_changelog_is_too_idle(mdd, mc->mc_minrec, mc->mc_mintime) ||
OBD_FAIL_CHECK(OBD_FAIL_FORCE_GC_THREAD);
}
struct dt_object *mdd_orphans; /* PENDING directory */
struct mdd_changelog mdd_cl;
unsigned int mdd_changelog_gc;
+ /* emrg GC is in progress */
+ bool mdd_changelog_emrg_gc;
+ /* enable GC when changelog free space is low */
+ bool mdd_changelog_free_space_gc;
time64_t mdd_changelog_max_idle_time;
unsigned long mdd_changelog_max_idle_indexes;
time64_t mdd_changelog_min_gc_interval;
idle_time * idle_indexes > (24 * 3600ULL << 32));
}
+bool mdd_changelog_is_space_safe(const struct lu_env *env,
+ struct mdd_device *mdd,
+ struct llog_handle *lgh,
+ bool estimate);
+
#endif
}
LUSTRE_RW_ATTR(changelog_gc);
+/* sysfs read handler: print mdd_changelog_free_space_gc followed by newline */
+static ssize_t changelog_free_space_gc_show(struct kobject *kobj,
+					    struct attribute *attr, char *buf)
+{
+	struct mdd_device *mdd;
+
+	mdd = container_of(kobj, struct mdd_device, mdd_kobj);
+
+	return sprintf(buf, "%u\n", mdd->mdd_changelog_free_space_gc);
+}
+
+/*
+ * sysfs write handler: parse \a buffer with kstrtobool() and store the
+ * result in mdd_changelog_free_space_gc.
+ *
+ * Returns \a count on success or the kstrtobool() error code.
+ */
+static ssize_t changelog_free_space_gc_store(struct kobject *kobj,
+					     struct attribute *attr,
+					     const char *buffer, size_t count)
+{
+	struct mdd_device *mdd;
+	bool enable;
+	int rc = kstrtobool(buffer, &enable);
+
+	/* leave the current setting untouched on a parse error */
+	if (rc != 0)
+		return rc;
+
+	mdd = container_of(kobj, struct mdd_device, mdd_kobj);
+	mdd->mdd_changelog_free_space_gc = enable;
+
+	return count;
+}
+LUSTRE_RW_ATTR(changelog_free_space_gc);
+
static ssize_t changelog_max_idle_time_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
&lustre_attr_atime_diff.attr,
&lustre_attr_changelog_size.attr,
&lustre_attr_changelog_gc.attr,
+ &lustre_attr_changelog_free_space_gc.attr,
&lustre_attr_changelog_max_idle_time.attr,
&lustre_attr_changelog_max_idle_indexes.attr,
&lustre_attr_changelog_min_gc_interval.attr,
rec = container_of(hdr, typeof(*rec), cur_hdr);
- if (mdd_changelog_is_too_idle(mdd, rec->cur_endrec, rec->cur_time) &&
- rec->cur_endrec < mcgc->mcgc_minrec) {
+ if (rec->cur_endrec < mcgc->mcgc_minrec &&
+ (mdd->mdd_changelog_emrg_gc ||
+ mdd_changelog_is_too_idle(mdd, rec->cur_endrec, rec->cur_time))) {
mcgc->mcgc_mintime = rec->cur_time;
mcgc->mcgc_minrec = rec->cur_endrec;
mcgc->mcgc_id = rec->cur_id;
mdd_changelog_user_purge(env, mdd, mcgc.mcgc_id);
+ if (mdd->mdd_changelog_emrg_gc &&
+ mdd_changelog_is_space_safe(env, mdd, ctxt->loc_handle, 0))
+ mdd->mdd_changelog_emrg_gc = false;
+
if (kthread_should_stop())
GOTO(out_ctxt, rc = 0);
}
}
run_test 160s "changelog garbage collect on idle records * time"
+test_160t() {
+	# Check changelog GC on lack of space: with the space limit forced
+	# between the changelog sizes sampled after each user's creates, the
+	# first registered user must be purged and the second one kept.
+	remote_mds_nodsh && skip "remote MDS with nodsh"
+	(( $MDS1_VERSION >= $(version_code 2.15.50) )) ||
+		skip "Need MDS version at least 2.15.50"
+
+	local MDT0=$(facet_svc $SINGLEMDS)
+	local cl_users
+	local cl_user1
+	local cl_user2
+	local start
+	# keep helper variables out of the global namespace of the test script
+	local llog_size1
+	local llog_size2
+	local param
+
+	changelog_register --user user1 -m all ||
+		error "user1 failed to register"
+
+	mkdir_on_mdt0 $DIR/$tdir
+	# create default overstripe to maximize changelog size
+	$LFS setstripe -C 8 $DIR/$tdir || error "setstripe failed"
+	createmany -o $DIR/$tdir/u1_ 2000 || error "createmany for user1 failed"
+	llog_size1=$(do_facet mds1 $LCTL get_param -n mdd.$MDT0.changelog_size)
+
+	# user2 consumes less records so less space
+	changelog_register --user user2 || error "user2 failed to register"
+	createmany -o $DIR/$tdir/u2_ 500 || error "createmany for user2 failed"
+	llog_size2=$(do_facet mds1 $LCTL get_param -n mdd.$MDT0.changelog_size)
+
+	# check changelogs have been generated
+	local nbcl=$(changelog_dump | wc -l)
+	(( nbcl > 0 )) || error "no changelogs found"
+
+	# reduce the changelog_min_gc_interval to force check
+	for param in "changelog_gc=1" "changelog_min_gc_interval=2"; do
+		local var="${param%=*}"
+		local old=$(do_facet mds1 "$LCTL get_param -n mdd.$MDT0.$var")
+
+		stack_trap "do_facet mds1 $LCTL set_param mdd.$MDT0.$var=$old"
+		do_facet mds1 $LCTL set_param mdd.$MDT0.$param ||
+			error "unable to set mdd.*.$param"
+	done
+
+	start=$SECONDS
+	cl_users=(${CL_USERS[mds1]})
+	cl_user1="${cl_users[0]}"
+	cl_user2="${cl_users[1]}"
+
+	[[ -n $cl_user1 ]] ||
+		error "mds1: user #1 isn't registered"
+	[[ -n $cl_user2 ]] ||
+		error "mds1: user #2 isn't registered"
+
+	# ensure we are past the previous changelog_min_gc_interval set above
+	local sleep2=$((start + 2 - SECONDS))
+	(( sleep2 > 0 )) && echo "sleep $sleep2 for interval" && sleep $sleep2
+
+	# force the space limit to the midpoint of the two sampled sizes
+	#define OBD_FAIL_MDS_CHANGELOG_ENOSPC 0x018c
+	do_facet mds1 $LCTL set_param fail_loc=0x018c \
+		fail_val=$(((llog_size1 + llog_size2) / 2))
+
+	# Generate more changelog to trigger GC
+	createmany -o $DIR/$tdir/u3_ 4 ||
+		error "create failed for more files"
+
+	# ensure gc thread is done
+	wait_update_facet mds1 "pgrep chlg_gc_thread" "" 20 ||
+		error "mds1: GC-thread not done"
+
+	do_facet mds1 $LCTL set_param fail_loc=0
+
+	# check cl_user1 is purged
+	changelog_users mds1 | grep -q "$cl_user1" &&
+		error "User $cl_user1 is registered"
+	# check cl_user2 is not purged
+	changelog_users mds1 | grep -q "$cl_user2" ||
+		error "User $cl_user2 is not registered"
+}
+run_test 160t "changelog garbage collect on lack of space"
+
test_161a() {
[ $PARALLEL == "yes" ] && skip "skip parallel run"