#define OBD_FAIL_CAT_FREE_RECORDS 0x1313
#define OBD_FAIL_TIME_IN_CHLOG_USER 0x1314
#define CFS_FAIL_CHLOG_USER_REG_UNREG_RACE 0x1315
+#define OBD_FAIL_FORCE_GC_THREAD 0x1316
#define OBD_FAIL_LLITE 0x1400
#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401
mdd->mdd_changelog_max_idle_indexes = CHLOG_MAX_IDLE_INDEXES;
/* with a reasonable interval between each check */
mdd->mdd_changelog_min_gc_interval = CHLOG_MIN_GC_INTERVAL;
- /* with a very few number of free entries */
+ /* with a very few number of free catalog entries */
mdd->mdd_changelog_min_free_cat_entries = CHLOG_MIN_FREE_CAT_ENTRIES;
dt_conf_get(env, mdd->mdd_child, &mdd->mdd_dt_conf);
return LLOG_PROC_BREAK;
}
+/* State shared between mdd_changelog_llog_init() and the two orphan
+ * detection llog callbacks below.
+ */
+struct changelog_orphan_data {
+	__u64 index;		/* oldest index found so far */
+	struct mdd_device *mdd;	/* device being scanned, used for messages */
+	bool found;		/* \a index was read from a valid record; an
+				 * explicit flag is needed because 0 is a
+				 * possible index value (a user's cur_endrec
+				 * is set from mc_index at registration)
+				 */
+};
+
+/* find oldest changelog record index
+ *
+ * llog_cat_process() callback for the changelog catalog. Stores the
+ * cr_index of the first valid changelog record met in \a data and stops
+ * the processing. Records of an unexpected type are reported and skipped.
+ *
+ * \param data	struct changelog_orphan_data to fill
+ * \retval 0			invalid record, continue with next one
+ * \retval LLOG_PROC_BREAK	first valid record found
+ */
+static int changelog_detect_orphan_cb(const struct lu_env *env,
+				      struct llog_handle *llh,
+				      struct llog_rec_hdr *hdr, void *data)
+{
+	struct changelog_orphan_data *orphan = data;
+	struct mdd_device *mdd = orphan->mdd;
+	struct llog_changelog_rec *rec = container_of(hdr,
+						      struct llog_changelog_rec,
+						      cr_hdr);
+
+	LASSERT(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN);
+
+	if (rec->cr_hdr.lrh_type != CHANGELOG_REC) {
+		CWARN("%s: invalid record at index %d in log "DFID"\n",
+		      mdd2obd_dev(mdd)->obd_name, hdr->lrh_index,
+		      PFID(&llh->lgh_id.lgl_oi.oi_fid));
+		/* try to find some next valid record and thus allow to recover
+		 * from a corrupted LLOG, instead to assert and force a crash
+		 */
+		return 0;
+	}
+
+	CDEBUG(D_INFO, "%s: seeing record at index %d/%d/%llu t=%x %.*s in log "
+	       DFID"\n", mdd2obd_dev(mdd)->obd_name, hdr->lrh_index,
+	       rec->cr_hdr.lrh_index, rec->cr.cr_index, rec->cr.cr_type,
+	       rec->cr.cr_namelen, changelog_rec_name(&rec->cr),
+	       PFID(&llh->lgh_id.lgl_oi.oi_fid));
+
+	orphan->index = rec->cr.cr_index;
+	orphan->found = true;
+	return LLOG_PROC_BREAK;
+}
+
+/* find oldest changelog user index
+ *
+ * llog_cat_process() callback for the changelog users catalog. Keeps in
+ * \a data the smallest cur_endrec of all valid user records. Records of
+ * an unexpected type are reported and skipped.
+ *
+ * \param data	struct changelog_orphan_data to update
+ * \retval 0	always, so that every user record gets examined
+ */
+static int changelog_user_detect_orphan_cb(const struct lu_env *env,
+					   struct llog_handle *llh,
+					   struct llog_rec_hdr *hdr, void *data)
+{
+	struct changelog_orphan_data *orphan = data;
+	struct mdd_device *mdd = orphan->mdd;
+	struct llog_changelog_user_rec *rec = container_of(hdr,
+						struct llog_changelog_user_rec,
+						cur_hdr);
+
+	LASSERT(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN);
+
+	if (rec->cur_hdr.lrh_type != CHANGELOG_USER_REC) {
+		CWARN("%s: invalid user at index %d in log "DFID"\n",
+		      mdd2obd_dev(mdd)->obd_name, hdr->lrh_index,
+		      PFID(&llh->lgh_id.lgl_oi.oi_fid));
+		/* try to find some next valid record and thus allow to recover
+		 * from a corrupted LLOG, instead to assert and force a crash
+		 */
+		return 0;
+	}
+
+	CDEBUG(D_INFO, "%s: seeing user at index %d/%d id=%d endrec=%llu in "
+	       "log "DFID"\n", mdd2obd_dev(mdd)->obd_name, hdr->lrh_index,
+	       rec->cur_hdr.lrh_index, rec->cur_id, rec->cur_endrec,
+	       PFID(&llh->lgh_id.lgl_oi.oi_fid));
+
+	/* track the minimum; do not treat index 0 as "not set yet", or a
+	 * user still at index 0 would be overridden by any later user and
+	 * its unread records wrongly detected as orphans
+	 */
+	if (!orphan->found || rec->cur_endrec < orphan->index) {
+		orphan->index = rec->cur_endrec;
+		orphan->found = true;
+	}
+
+	return 0;
+}
+
+struct changelog_cancel_cookie { /* data for llog_changelog_cancel_cb() */
+ long long endrec; /* callback stops at first cr_index above this */
+ struct mdd_device *mdd; /* its mc_gc_task is compared with current */
+};
+
static int llog_changelog_cancel_cb(const struct lu_env *env,
struct llog_handle *llh,
struct llog_rec_hdr *hdr, void *data)
{
struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr;
struct llog_cookie cookie;
- long long endrec = *(long long *)data;
+ struct changelog_cancel_cookie *cl_cookie =
+ (struct changelog_cancel_cookie *)data;
int rc;
ENTRY;
/* This is always a (sub)log, not the catalog */
LASSERT(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN);
- if (rec->cr.cr_index > endrec)
+ /* if current context is GC-thread allow it to stop upon umount
+ * remaining records cleanup will occur upon next mount
+ *
+ * also during testing, wait for GC-thread to be released
+ *
+ * XXX this requires the GC-thread to not fork a sub-thread via
+ * llog[_cat]_process_or_fork() and we may think to also implement
+ * this shutdown mechanism for manually started user unregister which
+ * can also take a long time if huge backlog of records
+ */
+ if (unlikely(cl_cookie->mdd->mdd_cl.mc_gc_task == current)) {
+ /* wait to be released */
+ while (CFS_FAIL_CHECK_QUIET(OBD_FAIL_FORCE_GC_THREAD))
+ schedule();
+
+ if (kthread_should_stop())
+ RETURN(LLOG_PROC_BREAK);
+ }
+
+ if (rec->cr.cr_index > cl_cookie->endrec)
/* records are in order, so we're done */
RETURN(LLOG_PROC_BREAK);
static int llog_changelog_cancel(const struct lu_env *env,
struct llog_ctxt *ctxt,
- long long endrec)
+ struct changelog_cancel_cookie *cookie)
{
struct llog_handle *cathandle = ctxt->loc_handle;
int rc;
LASSERT(cathandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT);
rc = llog_cat_process(env, cathandle, llog_changelog_cancel_cb,
- &endrec, 0, 0);
+ cookie, 0, 0);
if (rc >= 0)
/* 0 or 1 means we're done */
rc = 0;
{
struct obd_device *obd = mdd2obd_dev(mdd);
struct llog_ctxt *ctxt = NULL, *uctxt = NULL;
+ struct changelog_orphan_data changelog_orphan = { .index = 0,
+ .mdd = mdd },
+ user_orphan = { .index = 0,
+ .mdd = mdd };
int rc;
ENTRY;
if (rc < 0)
GOTO(out_uclose, rc);
}
+
+ /* find and clear any orphan changelog records (1st record index <
+ * smallest current index of all users). They likely come from an
+ * interrupted manual or GC-thread purge, where the user record
+ * had been deleted first.
+ * XXX we may wait for a clear operation from a still registered
+ * user to do the job, but it may then take a long time to reach
+ * the records that user really targeted if a huge purge backlog is
+ * still to be processed, as the record of a user idle for a long
+ * time could have been deleted
+ * XXX we may need to run end of purge as a separate thread
+ */
+ rc = llog_cat_process(env, ctxt->loc_handle, changelog_detect_orphan_cb,
+ &changelog_orphan, 0, 0);
+ if (rc < 0) {
+ CERROR("%s: changelog detect orphan failed: rc = %d\n",
+ obd->obd_name, rc);
+ GOTO(out_uclose, rc);
+ }
+ rc = llog_cat_process(env, uctxt->loc_handle,
+ changelog_user_detect_orphan_cb,
+ &user_orphan, 0, 0);
+ if (rc < 0) {
+ CERROR("%s: changelog user detect orphan failed: rc = %d\n",
+ obd->obd_name, rc);
+ GOTO(out_uclose, rc);
+ }
+ if (unlikely(changelog_orphan.index < user_orphan.index)) {
+ struct changelog_cancel_cookie cl_cookie = {
+ .endrec = user_orphan.index,
+ .mdd = mdd,
+ };
+
+ CWARN("%s : orphan changelog records found, starting from "
+ "index %llu to index %llu, being cleared now\n",
+ obd->obd_name, changelog_orphan.index, user_orphan.index);
+
+ /* XXX we may need to run end of purge as a separate thread */
+ rc = llog_changelog_cancel(env, ctxt, &cl_cookie);
+ if (rc < 0) {
+ CERROR("%s: purge of changelog orphan records failed: "
+ "rc = %d\n", obd->obd_name, rc);
+ GOTO(out_uclose, rc);
+ }
+ }
+
llog_ctxt_put(ctxt);
llog_ctxt_put(uctxt);
RETURN(0);
spin_lock_init(&mdd->mdd_cl.mc_user_lock);
mdd->mdd_cl.mc_lastuser = 0;
+ /* ensure a GC check will, and a thread run may, occur upon start */
+ mdd->mdd_cl.mc_gc_time = 0;
+ mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
+
rc = mdd_changelog_llog_init(env, mdd);
if (rc) {
CERROR("%s: changelog setup during init failed: rc = %d\n",
mdd->mdd_cl.mc_flags = 0;
+again:
+	/* stop GC-thread if running
+	 *
+	 * mc_gc_task is examined under mc_lock:
+	 * - MDD_CHLG_GC_NONE: no thread, store current to prevent any
+	 *   further attempt to start one
+	 * - MDD_CHLG_GC_NEED/MDD_CHLG_GC_START: a thread is about to be
+	 *   started, wait until it has published itself
+	 * - anything else is the task_struct of the thread to be stopped
+	 */
+	spin_lock(&mdd->mdd_cl.mc_lock);
+	if (likely(mdd->mdd_cl.mc_gc_task == MDD_CHLG_GC_NONE)) {
+		/* avoid any attempt to run a GC-thread */
+		mdd->mdd_cl.mc_gc_task = current;
+		spin_unlock(&mdd->mdd_cl.mc_lock);
+	} else {
+		struct task_struct *gc_task;
+
+		if (unlikely(mdd->mdd_cl.mc_gc_task == MDD_CHLG_GC_NEED ||
+			     mdd->mdd_cl.mc_gc_task == MDD_CHLG_GC_START)) {
+			/* need to wait for birthing GC-thread to be started
+			 * and to have set mc_gc_task to itself
+			 */
+			spin_unlock(&mdd->mdd_cl.mc_lock);
+			/* really sleep: schedule_timeout() alone does not
+			 * sleep at all while the task state is TASK_RUNNING,
+			 * which would turn this wait into a busy loop
+			 */
+			schedule_timeout_uninterruptible(usecs_to_jiffies(10));
+			/* go back to fully check if GC-thread has started or
+			 * even already exited or if a new one is starting...
+			 */
+			goto again;
+		}
+		/* take a reference on task_struct to avoid it to be freed
+		 * upon exit
+		 */
+		gc_task = mdd->mdd_cl.mc_gc_task;
+		get_task_struct(gc_task);
+		spin_unlock(&mdd->mdd_cl.mc_lock);
+		kthread_stop(gc_task);
+		put_task_struct(gc_task);
+	}
+
+
ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT);
if (ctxt) {
llog_cat_close(env, ctxt->loc_handle);
struct obd_device *obd = mdd2obd_dev(mdd);
struct llog_ctxt *ctxt;
long long unsigned cur;
+ struct changelog_cancel_cookie cookie;
int rc;
ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT);
changed since the last purge) */
mdd->mdd_cl.mc_starttime = ktime_get();
- rc = llog_changelog_cancel(env, ctxt, endrec);
+ cookie.endrec = endrec;
+ cookie.mdd = mdd;
+ rc = llog_changelog_cancel(env, ctxt, &cookie);
out:
llog_ctxt_put(ctxt);
return rc;
mdd->mdd_cl.mc_users++;
rec->cur_endrec = mdd->mdd_cl.mc_index;
- rec->cur_time = (__u32)get_seconds();
+ rec->cur_time = (__u32)ktime_get_real_seconds();
if (OBD_FAIL_CHECK(OBD_FAIL_TIME_IN_CHLOG_USER))
rec->cur_time = 0;
*/
rec->cur_endrec = mcuc->mcuc_endrec;
- rec->cur_time = (__u32)get_seconds();
+ rec->cur_time = (__u32)ktime_get_real_seconds();
if (OBD_FAIL_CHECK(OBD_FAIL_TIME_IN_CHLOG_USER))
rec->cur_time = 0;
#define DEBUG_SUBSYSTEM S_MDS
-#include <linux/kthread.h>
-
#include <obd_class.h>
#include <obd_support.h>
#include <lustre_mds.h>
return rc;
}
-struct mdd_changelog_gc {
- struct mdd_device *mcgc_mdd;
- bool mcgc_found;
- __u32 mcgc_maxtime;
- __u64 mcgc_maxindexes;
- __u32 mcgc_id;
-};
-
-/* return first registered ChangeLog user idle since too long
- * use ChangeLog's user plain LLOG mtime for this */
-static int mdd_changelog_gc_cb(const struct lu_env *env,
- struct llog_handle *llh,
- struct llog_rec_hdr *hdr, void *data)
-{
- struct llog_changelog_user_rec *rec;
- struct mdd_changelog_gc *mcgc = (struct mdd_changelog_gc *)data;
- struct mdd_device *mdd = mcgc->mcgc_mdd;
- ENTRY;
-
- if ((llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) == 0)
- RETURN(-ENXIO);
-
- rec = container_of(hdr, struct llog_changelog_user_rec,
- cur_hdr);
-
- /* find oldest idle user, based on last record update/cancel time (new
- * behavior), or for old user records, last record index vs current
- * ChangeLog index. Late users with old record format will be treated
- * first as we assume they could be idle since longer
- */
- if (rec->cur_time != 0) {
- __u32 time_now = (__u32)get_seconds();
- __u32 time_out = rec->cur_time +
- mdd->mdd_changelog_max_idle_time;
- __u32 idle_time = time_now - rec->cur_time;
-
- /* treat oldest idle user first, and if no old format user
- * has been already selected
- */
- if (time_after32(time_now, time_out) &&
- idle_time > mcgc->mcgc_maxtime &&
- mcgc->mcgc_maxindexes == 0) {
- mcgc->mcgc_maxtime = idle_time;
- mcgc->mcgc_id = rec->cur_id;
- mcgc->mcgc_found = true;
- }
- } else {
- /* old user record with no idle time stamp, so use empirical
- * method based on its current index/position
- */
- __u64 idle_indexes;
-
- idle_indexes = mdd->mdd_cl.mc_index - rec->cur_endrec;
-
- /* treat user with the oldest/smallest current index first */
- if (idle_indexes >= mdd->mdd_changelog_max_idle_indexes &&
- idle_indexes > mcgc->mcgc_maxindexes) {
- mcgc->mcgc_maxindexes = idle_indexes;
- mcgc->mcgc_id = rec->cur_id;
- mcgc->mcgc_found = true;
- }
-
- }
- RETURN(0);
-}
-
-/* recover space from long-term inactive ChangeLog users */
-static int mdd_chlg_garbage_collect(void *data)
-{
- struct mdd_device *mdd = (struct mdd_device *)data;
- struct lu_env *env = NULL;
- int rc;
- struct llog_ctxt *ctxt;
- struct mdd_changelog_gc mcgc = {
- .mcgc_mdd = mdd,
- .mcgc_found = false,
- .mcgc_maxtime = 0,
- .mcgc_maxindexes = 0,
- };
- ENTRY;
-
- CDEBUG(D_HA, "%s: ChangeLog garbage collect thread start\n",
- mdd2obd_dev(mdd)->obd_name);
-
- OBD_ALLOC_PTR(env);
- if (env == NULL)
- GOTO(out, rc = -ENOMEM);
-
- rc = lu_env_init(env, LCT_MD_THREAD);
- if (rc)
- GOTO(out, rc);
-
- for (;;) {
- ctxt = llog_get_context(mdd2obd_dev(mdd),
- LLOG_CHANGELOG_USER_ORIG_CTXT);
- if (ctxt == NULL ||
- (ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) == 0)
- GOTO(out_env, rc = -ENXIO);
-
- rc = llog_cat_process(env, ctxt->loc_handle,
- mdd_changelog_gc_cb, &mcgc, 0, 0);
- if (rc != 0 || mcgc.mcgc_found == false)
- break;
- llog_ctxt_put(ctxt);
-
- CWARN("%s: Force deregister of ChangeLog user cl%d idle more "
- "than %us\n", mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_id,
- mcgc.mcgc_maxtime);
-
- mdd_changelog_user_purge(env, mdd, mcgc.mcgc_id);
-
- /* try again to search for another candidate */
- mcgc.mcgc_found = false;
- mcgc.mcgc_maxtime = 0;
- mcgc.mcgc_maxindexes = 0;
- }
-
-out_env:
- if (ctxt != NULL)
- llog_ctxt_put(ctxt);
-
- lu_env_fini(env);
- GOTO(out, rc);
-out:
- if (env)
- OBD_FREE_PTR(env);
- mdd->mdd_cl.mc_gc_task = NULL;
- return rc;
-}
-
/** Add a changelog entry \a rec to the changelog llog
* \param mdd
* \param rec
struct llog_ctxt *ctxt;
struct thandle *llog_th;
int rc;
- bool run_gc_task = false;
rec->cr_hdr.lrh_len = llog_data_len(sizeof(*rec) +
changelog_rec_varsize(&rec->cr));
rc = llog_add(env, ctxt->loc_handle, &rec->cr_hdr, NULL, llog_th);
/* time to recover some space ?? */
+ if (likely(!mdd->mdd_changelog_gc ||
+ mdd->mdd_cl.mc_gc_task != MDD_CHLG_GC_NONE ||
+ mdd->mdd_changelog_min_gc_interval >=
+ ktime_get_real_seconds() - mdd->mdd_cl.mc_gc_time))
+ /* save a spin_lock trip */
+ goto out_put;
spin_lock(&mdd->mdd_cl.mc_lock);
- if (unlikely(mdd->mdd_changelog_gc && (ktime_get_real_seconds() -
- mdd->mdd_cl.mc_gc_time > mdd->mdd_changelog_min_gc_interval) &&
- mdd->mdd_cl.mc_gc_task == NULL &&
- llog_cat_free_space(ctxt->loc_handle) <=
- mdd->mdd_changelog_min_free_cat_entries)) {
- CWARN("%s: low on changelog_catalog free entries, starting "
- "ChangeLog garbage collection thread\n", obd->obd_name);
-
- /* indicate further kthread run will occur outside right after
- * critical section
+ if (likely(mdd->mdd_changelog_gc &&
+ mdd->mdd_cl.mc_gc_task == MDD_CHLG_GC_NONE &&
+ ktime_get_real_seconds() - mdd->mdd_cl.mc_gc_time >
+ mdd->mdd_changelog_min_gc_interval)) {
+ if (unlikely(llog_cat_free_space(ctxt->loc_handle) <=
+ mdd->mdd_changelog_min_free_cat_entries ||
+ OBD_FAIL_CHECK(OBD_FAIL_FORCE_GC_THREAD))) {
+ CWARN("%s:%s low on changelog_catalog free entries, "
+ "starting ChangeLog garbage collection thread\n",
+ obd->obd_name,
+ OBD_FAIL_CHECK(OBD_FAIL_FORCE_GC_THREAD) ?
+ " simulate" : "");
+
+ /* indicate further kthread run will occur outside
+ * right after current journal transaction filling has
+ * completed
+ */
+ mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NEED;
+ }
+ /* next check in mdd_changelog_min_gc_interval anyway
*/
- mdd->mdd_cl.mc_gc_task = (struct task_struct *)(-1);
- run_gc_task = true;
+ mdd->mdd_cl.mc_gc_time = ktime_get_real_seconds();
}
spin_unlock(&mdd->mdd_cl.mc_lock);
- if (run_gc_task) {
- struct task_struct *gc_task;
-
- gc_task = kthread_run(mdd_chlg_garbage_collect, mdd,
- "chlg_gc_thread");
- if (IS_ERR(gc_task)) {
- CERROR("%s: cannot start ChangeLog garbage collection "
- "thread: rc = %ld\n", obd->obd_name,
- PTR_ERR(gc_task));
- mdd->mdd_cl.mc_gc_task = NULL;
- } else {
- CDEBUG(D_HA, "%s: ChangeLog garbage collection thread "
- "has started with Pid %d\n", obd->obd_name,
- gc_task->pid);
- mdd->mdd_cl.mc_gc_task = gc_task;
- mdd->mdd_cl.mc_gc_time = ktime_get_real_seconds();
- }
- }
out_put:
llog_ctxt_put(ctxt);
if (rc > 0)
#define LLOG_CHANGELOG_HDR_SZ (sizeof(struct llog_changelog_rec) - \
sizeof(struct changelog_rec))
+/* mc_gc_task values
+ *
+ * mdd_changelog_store() moves NONE to NEED, mdd_trans_stop() moves NEED
+ * to START right before spawning the thread, the thread stores its own
+ * task_struct address on entry and resets the field to NONE on exit.
+ * The cast values are fully parenthesized so that the macros stay safe
+ * in any expression context.
+ */
+/** no GC thread to be started **/
+#define MDD_CHLG_GC_NONE	NULL
+/** a GC thread need to be started **/
+#define MDD_CHLG_GC_NEED	((struct task_struct *)(-1))
+/** a GC thread will be started now **/
+#define MDD_CHLG_GC_START	((struct task_struct *)(-2))
+/** else the started task_struct address when running **/
struct mdd_changelog {
spinlock_t mc_lock; /* for index */
int mc_lastuser;
int mc_users; /* registered users number */
struct task_struct *mc_gc_task;
- time64_t mc_gc_time;
+ time64_t mc_gc_time; /* last GC check or run time */
unsigned int mc_deniednext; /* interval for recording denied
* accesses
*/
if (rc)
return rc;
- /* XXX may need to limit with reasonable elapsed/idle times */
- if (val < 1)
+ /* as it sounds reasonable, do not allow a user to be idle since
+ * more than about 68 years, this will allow to use 32bits
+ * timestamps for comparison
+ */
+ if (val < 1 || val > INT_MAX)
return -ERANGE;
mdd->mdd_changelog_max_idle_time = val;
#define DEBUG_SUBSYSTEM S_MDS
+#include <linux/kthread.h>
+
#include <obd_class.h>
#include <lprocfs_status.h>
#include <lustre_mds.h>
return mdd_child_ops(mdd)->dt_trans_start(env, mdd->mdd_child, th);
}
+struct mdd_changelog_gc { /* search state of mdd_changelog_gc_cb() */
+ struct mdd_device *mcgc_mdd; /* device providing limits and mc_index */
+ __u32 mcgc_id; /* cur_id of the currently selected user */
+ __u32 mcgc_maxtime; /* idle time of the selected timestamped user */
+ __u64 mcgc_maxindexes; /* index lag of the selected old format user */
+ bool mcgc_found; /* set once a user has been selected */
+};
+
+/* llog_cat_process() callback selecting the registered ChangeLog user
+ * that has been idle for the longest, provided it exceeds the configured
+ * limits.
+ *
+ * User records carrying a timestamp (cur_time != 0) are judged on the
+ * seconds elapsed since cur_time, against mdd_changelog_max_idle_time.
+ * Records without timestamp are judged on how far cur_endrec lags behind
+ * mc_index, against mdd_changelog_max_idle_indexes. A timestamped user
+ * is only selected as long as no user without timestamp has been
+ * (mcgc_maxindexes == 0).
+ *
+ * \param data	struct mdd_changelog_gc, updated in place with the best
+ *		candidate found so far
+ * \retval 0		continue with next record
+ * \retval -ENXIO	\a llh is not a plain llog
+ */
+static int mdd_changelog_gc_cb(const struct lu_env *env,
+			       struct llog_handle *llh,
+			       struct llog_rec_hdr *hdr, void *data)
+{
+	struct mdd_changelog_gc *gc = data;
+	struct mdd_device *mdd = gc->mcgc_mdd;
+	struct llog_changelog_user_rec *user;
+	ENTRY;
+
+	if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN))
+		RETURN(-ENXIO);
+
+	user = container_of(hdr, struct llog_changelog_user_rec, cur_hdr);
+
+	if (user->cur_time == 0) {
+		/* record without timestamp: fall back on the number of
+		 * indexes between its position and the current one, and
+		 * keep the user lagging the most
+		 */
+		__u64 lag = mdd->mdd_cl.mc_index - user->cur_endrec;
+
+		if (lag >= mdd->mdd_changelog_max_idle_indexes &&
+		    lag > gc->mcgc_maxindexes) {
+			gc->mcgc_maxindexes = lag;
+			gc->mcgc_id = user->cur_id;
+			gc->mcgc_found = true;
+		}
+	} else {
+		/* timestamped record: 32bits wrap-safe comparison of now
+		 * against the time when the user becomes too old
+		 */
+		__u32 now = (__u32)ktime_get_real_seconds();
+		__u32 deadline = user->cur_time +
+				 mdd->mdd_changelog_max_idle_time;
+		__u32 elapsed = now - user->cur_time;
+
+		/* users without timestamp have precedence, then keep the
+		 * one idle since the longest
+		 */
+		if (gc->mcgc_maxindexes == 0 &&
+		    elapsed > gc->mcgc_maxtime &&
+		    time_after32(now, deadline)) {
+			gc->mcgc_maxtime = elapsed;
+			gc->mcgc_id = user->cur_id;
+			gc->mcgc_found = true;
+		}
+	}
+
+	RETURN(0);
+}
+
+/* recover space from long-term inactive ChangeLog users
+ *
+ * Thread function passed to kthread_run() by mdd_trans_stop(). It
+ * repeatedly scans the ChangeLog users catalog with
+ * mdd_changelog_gc_cb() and calls mdd_changelog_user_purge() on the
+ * selected user, until no user is selected, the scan returns non-zero
+ * or kthread_should_stop() is true. mc_gc_task is set to current on
+ * entry and reset to MDD_CHLG_GC_NONE, under mc_lock, before returning.
+ *
+ * \param data	the struct mdd_device to work on
+ * \retval	0, -ENOMEM, -ENXIO, or the non-zero value returned by
+ *		lu_env_init() or llog_cat_process()
+ */
+static int mdd_chlg_garbage_collect(void *data)
+{
+ struct mdd_device *mdd = (struct mdd_device *)data;
+ struct lu_env *env = NULL;
+ int rc;
+ struct llog_ctxt *ctxt;
+ struct mdd_changelog_gc mcgc = {
+ .mcgc_mdd = mdd,
+ .mcgc_found = false,
+ .mcgc_maxtime = 0,
+ .mcgc_maxindexes = 0,
+ };
+ ENTRY;
+
+ mdd->mdd_cl.mc_gc_task = current; /* publish this thread as GC task */
+
+ CDEBUG(D_HA, "%s: ChangeLog garbage collect thread start with PID %d\n",
+ mdd2obd_dev(mdd)->obd_name, current->pid);
+
+ OBD_ALLOC_PTR(env);
+ if (env == NULL)
+ GOTO(out, rc = -ENOMEM);
+
+ rc = lu_env_init(env, LCT_MD_THREAD);
+ if (rc)
+ GOTO(out, rc); /* env not initialized, skip lu_env_fini() */
+
+ for (;;) { /* one user purged per iteration */
+ ctxt = llog_get_context(mdd2obd_dev(mdd),
+ LLOG_CHANGELOG_USER_ORIG_CTXT);
+ if (ctxt == NULL ||
+ (ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) == 0)
+ GOTO(out_ctxt, rc = -ENXIO);
+
+ rc = llog_cat_process(env, ctxt->loc_handle,
+ mdd_changelog_gc_cb, &mcgc, 0, 0);
+ if (rc != 0 || mcgc.mcgc_found == false)
+ break; /* ctxt reference is dropped at out_ctxt */
+ llog_ctxt_put(ctxt);
+
+ if (mcgc.mcgc_maxindexes != 0) /* selected on index lag */
+ CWARN("%s: Force deregister of ChangeLog user cl%d "
+ "idle with more than %llu unprocessed records\n",
+ mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_id,
+ mcgc.mcgc_maxindexes);
+ else
+ CWARN("%s: Force deregister of ChangeLog user cl%d "
+ "idle since more than %us\n",
+ mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_id,
+ mcgc.mcgc_maxtime);
+
+ mdd_changelog_user_purge(env, mdd, mcgc.mcgc_id);
+
+ if (kthread_should_stop())
+ GOTO(out_env, rc = 0); /* ctxt already put above */
+
+ /* try again to search for another candidate */
+ mcgc.mcgc_found = false;
+ mcgc.mcgc_maxtime = 0;
+ mcgc.mcgc_maxindexes = 0;
+ }
+
+out_ctxt:
+ if (ctxt != NULL)
+ llog_ctxt_put(ctxt);
+
+out_env:
+ lu_env_fini(env);
+ GOTO(out, rc);
+out:
+ if (env)
+ OBD_FREE_PTR(env);
+
+ spin_lock(&mdd->mdd_cl.mc_lock);
+ mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE; /* no GC thread anymore */
+ spin_unlock(&mdd->mdd_cl.mc_lock);
+
+ return rc;
+}
+
int mdd_trans_stop(const struct lu_env *env, struct mdd_device *mdd,
int result, struct thandle *handle)
{
rc = mdd_child_ops(mdd)->dt_trans_stop(env, mdd->mdd_child, handle);
barrier_exit(mdd->mdd_bottom);
+ /* bottom half of changelog garbage-collection mechanism, started
+ * from mdd_changelog_store(). This is required, as a kthread
+ * can't be started while a journal transaction is being filled:
+ * otherwise a deadlock can happen if memory reclaim is triggered
+ * by kthreadd when forking the new thread, as shrinkers could
+ * then attempt I/Os to the same device, requiring a new journal
+ * transaction to be started while the current one could never
+ * complete (LU-10680).
+ */
+ if (unlikely(mdd->mdd_cl.mc_flags & CLM_ON &&
+ cmpxchg(&mdd->mdd_cl.mc_gc_task, MDD_CHLG_GC_NEED,
+ MDD_CHLG_GC_START) == MDD_CHLG_GC_NEED)) {
+ /* XXX we may want to cmpxchg() only if MDD_CHLG_GC_NEED
+ * to save its cost in the frequent case and have an extra
+ * if/test cost in the rare case where we need to spawn?
+ */
+ struct task_struct *gc_task;
+ struct obd_device *obd = mdd2obd_dev(mdd);
+
+ gc_task = kthread_run(mdd_chlg_garbage_collect, mdd,
+ "chlg_gc_thread");
+ if (IS_ERR(gc_task)) {
+ CERROR("%s: cannot start ChangeLog garbage collection "
+ "thread: rc = %ld\n", obd->obd_name,
+ PTR_ERR(gc_task));
+ mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
+ } else {
+ CDEBUG(D_HA, "%s: a ChangeLog garbage collection "
+ "thread has been started\n", obd->obd_name);
+ }
+ }
+
/* if operation failed, return \a result, otherwise return status of
* dt_trans_stop */
return result ?: rc;
ALWAYS_EXCEPT="$SANITY_EXCEPT 42a 42b 42c 77k"
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
-# skipped tests: LU-8411 LU-9096 LU-9054 LU-10680 ..
-ALWAYS_EXCEPT=" 407 253 312 160f 160g $ALWAYS_EXCEPT"
+# skipped tests: LU-8411 LU-9096 LU-9054 LU-10734 ..
+ALWAYS_EXCEPT=" 407 253 312 160g $ALWAYS_EXCEPT"
# Check Grants after these tests
GRANT_CHECK_LIST="$GRANT_CHECK_LIST 42a 42b 42c 42d 42e 63a 63b 64a 64b 64c"
run_test 160e "changelog negative testing (should return errors)"
test_160f() {
- remote_mds_nodsh && skip "remote MDS with nodsh"
+ remote_mds_nodsh && skip "remote MDS with nodsh" && return
[[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.10.56) ]] ||
- skip "Need MDS version at least 2.10.56"
+ { skip "Need MDS version at least 2.10.56"; return 0; }
local mdts=$(comma_list $(mdts_nodes))
# Create a user
changelog_register || error "first changelog_register failed"
changelog_register || error "second changelog_register failed"
- local cl_users=(${CL_USERS[$SINGLEMDS]})
- local cl_user1="${cl_users[0]}"
- local cl_user2="${cl_users[1]}"
+ local cl_users
+ declare -A cl_user1
+ declare -A cl_user2
+ local user_rec1
+ local user_rec2
+ local i
# generate some changelog records to accumulate on each MDT
test_mkdir -c $MDSCOUNT $DIR/$tdir || error "test_mkdir $tdir failed"
error "create $DIR/$tdir/$tfile failed"
# check changelogs have been generated
- nbcl=$(changelog_dump | wc -l)
+ local nbcl=$(changelog_dump | wc -l)
[[ $nbcl -eq 0 ]] && error "no changelogs found"
- # changelog_gc=1 should be set by default
for param in "changelog_max_idle_time=10" \
+ "changelog_gc=1" \
"changelog_min_gc_interval=2" \
"changelog_min_free_cat_entries=3"; do
local MDT0=$(facet_svc $SINGLEMDS)
do_nodes $mdts $LCTL set_param mdd.*.$param
done
+ # force cl_user2 to be idle (1st part)
+ sleep 9
+
# simulate changelog catalog almost full
#define OBD_FAIL_CAT_FREE_RECORDS 0x1313
do_nodes $mdts $LCTL set_param fail_loc=0x1313 fail_val=3
- sleep 6
- local user_rec1=$(changelog_user_rec $SINGLEMDS $cl_user1)
- [ -n "$user_rec1" ] ||
- error "User $cl_user1 not found in changelog_users"
- __changelog_clear $SINGLEMDS $cl_user1 +2
- local user_rec2=$(changelog_user_rec $SINGLEMDS $cl_user1)
- [ -n "$user_rec2" ] ||
- error "User $cl_user1 not found in changelog_users"
- echo "verifying user clear: $user_rec1 + 2 == $user_rec2"
- [ $((user_rec1 + 2)) == $user_rec2 ] ||
- error "user index expected $user_rec1 + 2, but is $user_rec2"
- sleep 5
+	# on each MDT, check both users are registered at the same index,
+	# then move cl_user1 forward by 2 records; cl_user2 is left idle
+	for i in $(seq $MDSCOUNT); do
+		cl_users=(${CL_USERS[mds$i]})
+		cl_user1[mds$i]="${cl_users[0]}"
+		cl_user2[mds$i]="${cl_users[1]}"
+
+		[ -n "${cl_user1[mds$i]}" ] ||
+			error "mds$i: no user registered"
+		# cl_user2 is empty here, so report the one user we do have
+		[ -n "${cl_user2[mds$i]}" ] ||
+			error "mds$i: only ${cl_user1[mds$i]} is registered"
+
+		user_rec1=$(changelog_user_rec mds$i ${cl_user1[mds$i]})
+		[ -n "$user_rec1" ] ||
+			error "mds$i: User ${cl_user1[mds$i]} not registered"
+		__changelog_clear mds$i ${cl_user1[mds$i]} +2
+		user_rec2=$(changelog_user_rec mds$i ${cl_user1[mds$i]})
+		[ -n "$user_rec2" ] ||
+			error "mds$i: User ${cl_user1[mds$i]} not registered"
+		echo "mds$i: verifying user ${cl_user1[mds$i]} clear: " \
+		     "$user_rec1 + 2 == $user_rec2"
+		[ $((user_rec1 + 2)) == $user_rec2 ] ||
+			error "mds$i: user ${cl_user1[mds$i]} index expected " \
+			      "$user_rec1 + 2, but is $user_rec2"
+		user_rec2=$(changelog_user_rec mds$i ${cl_user2[mds$i]})
+		[ -n "$user_rec2" ] ||
+			error "mds$i: User ${cl_user2[mds$i]} not registered"
+		[ $user_rec1 == $user_rec2 ] ||
+			error "mds$i: user ${cl_user2[mds$i]} index expected " \
+			      "$user_rec1, but is $user_rec2"
+	done
+
+
+ # force cl_user2 to be idle (2nd part) and to reach
+ # changelog_max_idle_time
+ sleep 2
# generate one more changelog to trigger fail_loc
- rm -rf $DIR/$tdir || error "rm -rf $tdir failed"
+ createmany -m $DIR/$tdir/${tfile}bis $((MDSCOUNT * 2)) ||
+ error "create $DIR/$tdir/${tfile}bis failed"
# ensure gc thread is done
- wait_update_facet $SINGLEMDS \
- "ps -e -o comm= | grep chlg_gc_thread" "" 20
-
- # check user still registered
- changelog_users $SINGLEMDS | grep -q "$cl_user1" ||
- error "User $cl_user1 not found in changelog_users"
- # check user2 unregistered
- changelog_users $SINGLEMDS | grep -q "$cl_user2" &&
- error "User $cl_user2 still found in changelog_users"
-
- # check changelogs are present and starting at $user_rec2 + 1
- local first_rec=$($LFS changelog $(facet_svc $SINGLEMDS) |
- awk '{ print $1; exit; }')
+ for i in $(mdts_nodes); do
+ wait_update $i \
+ "ps -e -o comm= | grep chlg_gc_thread" "" 20 ||
+ error "$i: GC-thread not done"
+ done
- echo "verifying min purge: $user_rec2 + 1 == $first_rec"
- [ $((user_rec2 + 1)) == $first_rec ] ||
- error "first index should be $user_rec2 + 1, but is $first_rec"
+ local first_rec
+ for i in $(seq $MDSCOUNT); do
+ # check cl_user1 still registered
+ changelog_users mds$i | grep -q "${cl_user1[mds$i]}" ||
+ error "mds$i: User ${cl_user1[mds$i]} not registered"
+ # check cl_user2 unregistered
+ changelog_users mds$i | grep -q "${cl_user2[mds$i]}" &&
+ error "mds$i: User ${cl_user2[mds$i]} still registered"
+
+ # check changelogs are present and starting at $user_rec1 + 1
+ user_rec1=$(changelog_user_rec mds$i ${cl_user1[mds$i]})
+ [ -n "$user_rec1" ] ||
+ error "mds$i: User ${cl_user1[mds$i]} not registered"
+ first_rec=$($LFS changelog $(facet_svc mds$i) |
+ awk '{ print $1; exit; }')
+
+ echo "mds$i: verifying first index $user_rec1 + 1 == $first_rec"
+ [ $((user_rec1 + 1)) == $first_rec ] ||
+ error "mds$i: first index should be $user_rec1 + 1, " \
+ "but is $first_rec"
+ done
}
run_test 160f "changelog garbage collect (timestamped users)"
run_test 160g "changelog garbage collect (old users)"
test_160h() {
+	# Check that the changelog GC-thread gets stopped by an MDT umount
+	# and that the records left behind by the interrupted purge (orphans)
+	# are cleared during the next mount.
+	remote_mds_nodsh && skip "remote MDS with nodsh" && return
+	[[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.10.56) ]] ||
+		{ skip "Need MDS version at least 2.10.56"; return 0; }
+
+	local mdts=$(comma_list $(mdts_nodes))
+
+	# Create two users
+	changelog_register || error "first changelog_register failed"
+	changelog_register || error "second changelog_register failed"
+	local cl_users
+	declare -A cl_user1
+	declare -A cl_user2
+	local user_rec1
+	local user_rec2
+	local i
+
+	# generate some changelog records to accumulate on each MDT
+	test_mkdir -c $MDSCOUNT $DIR/$tdir || error "test_mkdir $tdir failed"
+	createmany -m $DIR/$tdir/$tfile $((MDSCOUNT * 2)) ||
+		error "create $DIR/$tdir/$tfile failed"
+
+	# check changelogs have been generated
+	local nbcl=$(changelog_dump | wc -l)
+	[[ $nbcl -eq 0 ]] && error "no changelogs found"
+
+	# tune GC on all MDTs, previous values are restored upon exit
+	for param in "changelog_max_idle_time=10" \
+		     "changelog_gc=1" \
+		     "changelog_min_gc_interval=2"; do
+		local MDT0=$(facet_svc $SINGLEMDS)
+		local var="${param%=*}"
+		local old=$(do_facet mds1 "$LCTL get_param -n mdd.$MDT0.$var")
+
+		stack_trap "do_nodes $mdts $LCTL set_param mdd.*.$var=$old" EXIT
+		do_nodes $mdts $LCTL set_param mdd.*.$param
+	done
+
+	# force cl_user2 to be idle (1st part)
+	sleep 9
+
+	for i in $(seq $MDSCOUNT); do
+		cl_users=(${CL_USERS[mds$i]})
+		cl_user1[mds$i]="${cl_users[0]}"
+		cl_user2[mds$i]="${cl_users[1]}"
+
+		[ -n "${cl_user1[mds$i]}" ] ||
+			error "mds$i: no user registered"
+		# cl_user2 is empty here, so report the one user we do have
+		[ -n "${cl_user2[mds$i]}" ] ||
+			error "mds$i: only ${cl_user1[mds$i]} is registered"
+
+		user_rec1=$(changelog_user_rec mds$i ${cl_user1[mds$i]})
+		[ -n "$user_rec1" ] ||
+			error "mds$i: User ${cl_user1[mds$i]} not registered"
+		__changelog_clear mds$i ${cl_user1[mds$i]} +2
+		user_rec2=$(changelog_user_rec mds$i ${cl_user1[mds$i]})
+		[ -n "$user_rec2" ] ||
+			error "mds$i: User ${cl_user1[mds$i]} not registered"
+		echo "mds$i: verifying user ${cl_user1[mds$i]} clear: " \
+		     "$user_rec1 + 2 == $user_rec2"
+		[ $((user_rec1 + 2)) == $user_rec2 ] ||
+			error "mds$i: user ${cl_user1[mds$i]} index expected " \
+			      "$user_rec1 + 2, but is $user_rec2"
+		user_rec2=$(changelog_user_rec mds$i ${cl_user2[mds$i]})
+		[ -n "$user_rec2" ] ||
+			error "mds$i: User ${cl_user2[mds$i]} not registered"
+		[ $user_rec1 == $user_rec2 ] ||
+			error "mds$i: user ${cl_user2[mds$i]} index expected " \
+			      "$user_rec1, but is $user_rec2"
+	done
+
+	# force cl_user2 to be idle (2nd part) and to reach
+	# changelog_max_idle_time
+	sleep 2
+
+	# force each GC-thread start and block then
+	# one per MDT/MDD, set fail_val accordingly
+	#define OBD_FAIL_FORCE_GC_THREAD 0x1316
+	do_nodes $mdts $LCTL set_param fail_loc=0x1316
+
+	# generate more changelogs to trigger fail_loc
+	createmany -m $DIR/$tdir/${tfile}bis $((MDSCOUNT * 2)) ||
+		error "create $DIR/$tdir/${tfile}bis failed"
+
+	# stop MDT to stop GC-thread, should be done in back-ground as it will
+	# block waiting for the thread to be released and exit
+	declare -A stop_pids
+	for i in $(seq $MDSCOUNT); do
+		stop mds$i &
+		stop_pids[mds$i]=$!
+	done
+
+	for i in $(mdts_nodes); do
+		local facet
+		local nb=0
+		local facets=$(facets_up_on_host $i)
+
+		# count the MDTs served by this node
+		for facet in ${facets//,/ }; do
+			if [[ $facet == mds* ]]; then
+				nb=$((nb + 1))
+			fi
+		done
+		# ensure each MDS's gc threads are still present and all in "R"
+		# state (OBD_FAIL_FORCE_GC_THREAD effect!)
+		[[ $(do_node $i pgrep chlg_gc_thread | wc -l) -eq $nb ]] ||
+			error "$i: expected $nb GC-thread"
+		wait_update $i \
+			"ps -C chlg_gc_thread -o state --no-headers | uniq" \
+			"R" 20 ||
+			error "$i: GC-thread not found in R-state"
+		# check umounts of each MDT on MDS have reached kthread_stop()
+		[[ $(do_node $i pgrep umount | wc -l) -eq $nb ]] ||
+			error "$i: expected $nb umount"
+		wait_update $i \
+			"ps -C umount -o state --no-headers | uniq" "D" 20 ||
+			error "$i: umount not found in D-state"
+	done
+
+	# release all GC-threads
+	do_nodes $mdts $LCTL set_param fail_loc=0
+
+	# wait for MDT stop to complete
+	for i in $(seq $MDSCOUNT); do
+		wait ${stop_pids[mds$i]} || error "mds$i: stop failed"
+	done
+
+	# XXX
+	# may try to check if any orphan changelog records are present
+	# via ldiskfs/zfs and llog_reader...
+
+	# re-start/mount MDTs
+	for i in $(seq $MDSCOUNT); do
+		start mds$i $(mdsdevname $i) $MDS_MOUNT_OPTS ||
+			error "Fail to start mds$i"
+	done
+
+	local first_rec
+	for i in $(seq $MDSCOUNT); do
+		# check cl_user1 still registered
+		changelog_users mds$i | grep -q "${cl_user1[mds$i]}" ||
+			error "mds$i: User ${cl_user1[mds$i]} not registered"
+		# check cl_user2 unregistered
+		changelog_users mds$i | grep -q "${cl_user2[mds$i]}" &&
+			error "mds$i: User ${cl_user2[mds$i]} still registered"
+
+		# check changelogs are present and starting at $user_rec1 + 1
+		user_rec1=$(changelog_user_rec mds$i ${cl_user1[mds$i]})
+		[ -n "$user_rec1" ] ||
+			error "mds$i: User ${cl_user1[mds$i]} not registered"
+		first_rec=$($LFS changelog $(facet_svc mds$i) |
+			    awk '{ print $1; exit; }')
+
+		echo "mds$i: verifying first index $user_rec1 + 1 == $first_rec"
+		[ $((user_rec1 + 1)) == $first_rec ] ||
+			error "mds$i: first index should be $user_rec1 + 1, " \
+			      "but is $first_rec"
+	done
+}
+run_test 160h "changelog gc thread stop upon umount, orphan records delete " \
+ "during mount"
+
+test_160i() {
local mdts=$(comma_list $(mdts_nodes))
error "changelogs are off on mds$i"
done
}
-run_test 160h "changelog user register/unregister race"
+run_test 160i "changelog user register/unregister race"
test_161a() {
[ $PARALLEL == "yes" ] && skip "skip parallel run"