/* LU-8040 Set defaults here, before values configs */
mdd->mdd_cl.mc_flags = 0; /* off by default */
mdd->mdd_cl.mc_mask = CHANGELOG_DEFMASK;
+ mdd->mdd_cl.mc_deniednext = 60; /* 60 secs by default */
dev = lustre_cfg_string(lcfg, 0);
if (dev == NULL)
mdd->mdd_changelog_max_idle_indexes = CHLOG_MAX_IDLE_INDEXES;
/* with a reasonable interval between each check */
mdd->mdd_changelog_min_gc_interval = CHLOG_MIN_GC_INTERVAL;
- /* with a very few number of free entries */
+ /* with a very few number of free catalog entries */
mdd->mdd_changelog_min_free_cat_entries = CHLOG_MIN_FREE_CAT_ENTRIES;
+ /* special default striping for files created with O_APPEND */
+ mdd->mdd_append_stripe_count = 1;
+ mdd->mdd_append_pool[0] = '\0';
dt_conf_get(env, mdd->mdd_child, &mdd->mdd_dt_conf);
spin_lock(&mdd->mdd_cl.mc_user_lock);
mdd->mdd_cl.mc_lastuser = rec->cur_id;
+ mdd->mdd_cl.mc_users++;
if (rec->cur_endrec > mdd->mdd_cl.mc_index)
mdd->mdd_cl.mc_index = rec->cur_endrec;
spin_unlock(&mdd->mdd_cl.mc_user_lock);
return LLOG_PROC_BREAK;
}
+struct changelog_orphan_data { /* data argument handed to
+ * changelog_detect_orphan_cb() and
+ * changelog_user_detect_orphan_cb()
+ */
+ __u64 index; /* result filled in by the callback; 0 until one is stored */
+ struct mdd_device *mdd; /* device, used by the callbacks to name the
+ * obd device in their log messages
+ */
+};
+
+/* find oldest changelog record index
+ *
+ * llog record callback, passed to llog_cat_process() together with a
+ * struct changelog_orphan_data as \a data. The log handle must be a plain
+ * log (asserted). A record whose type is not CHANGELOG_REC is reported with
+ * CWARN and skipped by returning 0. For the first record of the expected
+ * type, its cr_index is stored in data->index and LLOG_PROC_BREAK is
+ * returned to end the scan. Treating that first record as the oldest
+ * presumes records are visited in increasing index order.
+ */
+static int changelog_detect_orphan_cb(const struct lu_env *env,
+ struct llog_handle *llh,
+ struct llog_rec_hdr *hdr, void *data)
+{
+ struct mdd_device *mdd = ((struct changelog_orphan_data *)data)->mdd;
+ struct llog_changelog_rec *rec = container_of(hdr,
+ struct llog_changelog_rec,
+ cr_hdr);
+
+ LASSERT(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN);
+
+ if (rec->cr_hdr.lrh_type != CHANGELOG_REC) {
+ CWARN("%s: invalid record at index %d in log "DFID"\n",
+ mdd2obd_dev(mdd)->obd_name, hdr->lrh_index,
+ PFID(&llh->lgh_id.lgl_oi.oi_fid));
+ /* keep going to find the next valid record, so that a corrupted
+ * LLOG can be recovered from instead of asserting and forcing
+ * a crash
+ */
+ return 0;
+ }
+
+ CDEBUG(D_INFO, "%s: seeing record at index %d/%d/%llu t=%x %.*s in log "
+ DFID"\n", mdd2obd_dev(mdd)->obd_name, hdr->lrh_index,
+ rec->cr_hdr.lrh_index, rec->cr.cr_index, rec->cr.cr_type,
+ rec->cr.cr_namelen, changelog_rec_name(&rec->cr),
+ PFID(&llh->lgh_id.lgl_oi.oi_fid));
+
+ ((struct changelog_orphan_data *)data)->index = rec->cr.cr_index;
+ return LLOG_PROC_BREAK; /* one valid record is all that is needed */
+}
+
+/* find oldest changelog user index
+ *
+ * llog record callback, passed to llog_cat_process() together with a
+ * struct changelog_orphan_data as \a data. The log handle must be a plain
+ * log (asserted). A record whose type is not CHANGELOG_USER_REC is reported
+ * with CWARN and skipped. For every other record, data->index is set to the
+ * record's cur_endrec when data->index is still 0 or when cur_endrec is
+ * smaller than it. The callback always returns 0, so it never asks for the
+ * scan to be cut short.
+ *
+ * NOTE(review): 0 doubles as the "nothing stored yet" marker, so a
+ * cur_endrec of 0 stored for one user is overwritten by the next user's
+ * larger value -- confirm this is intended.
+ */
+static int changelog_user_detect_orphan_cb(const struct lu_env *env,
+ struct llog_handle *llh,
+ struct llog_rec_hdr *hdr, void *data)
+{
+ struct mdd_device *mdd = ((struct changelog_orphan_data *)data)->mdd;
+ struct llog_changelog_user_rec *rec = container_of(hdr,
+ struct llog_changelog_user_rec,
+ cur_hdr);
+
+ LASSERT(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN);
+
+ if (rec->cur_hdr.lrh_type != CHANGELOG_USER_REC) {
+ CWARN("%s: invalid user at index %d in log "DFID"\n",
+ mdd2obd_dev(mdd)->obd_name, hdr->lrh_index,
+ PFID(&llh->lgh_id.lgl_oi.oi_fid));
+ /* keep going to find the next valid record, so that a corrupted
+ * LLOG can be recovered from instead of asserting and forcing
+ * a crash
+ */
+ return 0;
+ }
+
+ CDEBUG(D_INFO, "%s: seeing user at index %d/%d id=%d endrec=%llu in "
+ "log "DFID"\n", mdd2obd_dev(mdd)->obd_name, hdr->lrh_index,
+ rec->cur_hdr.lrh_index, rec->cur_id, rec->cur_endrec,
+ PFID(&llh->lgh_id.lgl_oi.oi_fid));
+
+ if (((struct changelog_orphan_data *)data)->index == 0 ||
+ rec->cur_endrec < ((struct changelog_orphan_data *)data)->index)
+ ((struct changelog_orphan_data *)data)->index = rec->cur_endrec;
+
+ return 0; /* keep scanning: every user record has to be compared */
+}
+
+struct changelog_cancel_cookie { /* data argument handed to
+ * llog_changelog_cancel_cb()
+ */
+ long long endrec; /* the callback breaks out once a record's cr_index
+ * is greater than this value
+ */
+ struct mdd_device *mdd; /* device whose mdd_cl.mc_gc_task the callback
+ * compares with current
+ */
+};
+
static int llog_changelog_cancel_cb(const struct lu_env *env,
struct llog_handle *llh,
struct llog_rec_hdr *hdr, void *data)
{
struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr;
struct llog_cookie cookie;
- long long endrec = *(long long *)data;
+ struct changelog_cancel_cookie *cl_cookie =
+ (struct changelog_cancel_cookie *)data;
int rc;
ENTRY;
/* This is always a (sub)log, not the catalog */
LASSERT(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN);
- if (rec->cr.cr_index > endrec)
+ /* if the current context is the GC-thread, allow it to stop upon
+ * umount; cleanup of the remaining records will occur upon next mount
+ *
+ * also, during testing, wait for the GC-thread to be released
+ *
+ * XXX this requires the GC-thread to not fork a sub-thread via
+ * llog[_cat]_process_or_fork(); we may also consider implementing
+ * this shutdown mechanism for a manually started user unregister,
+ * which can also take a long time if there is a huge backlog of
+ * records
+ */
+ if (unlikely(cl_cookie->mdd->mdd_cl.mc_gc_task == current)) {
+ /* wait to be released */
+ while (CFS_FAIL_CHECK_QUIET(OBD_FAIL_FORCE_GC_THREAD))
+ schedule();
+
+ if (kthread_should_stop())
+ RETURN(LLOG_PROC_BREAK);
+ }
+
+ if (rec->cr.cr_index > cl_cookie->endrec)
/* records are in order, so we're done */
RETURN(LLOG_PROC_BREAK);
static int llog_changelog_cancel(const struct lu_env *env,
struct llog_ctxt *ctxt,
- struct llog_cookie *cookies, int flags)
+ struct changelog_cancel_cookie *cookie)
{
struct llog_handle *cathandle = ctxt->loc_handle;
int rc;
LASSERT(cathandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT);
rc = llog_cat_process(env, cathandle, llog_changelog_cancel_cb,
- (void *)cookies, 0, 0);
+ cookie, 0, 0);
if (rc >= 0)
/* 0 or 1 means we're done */
rc = 0;
{
struct obd_device *obd = mdd2obd_dev(mdd);
struct llog_ctxt *ctxt = NULL, *uctxt = NULL;
+ struct changelog_orphan_data changelog_orphan = { .index = 0,
+ .mdd = mdd },
+ user_orphan = { .index = 0,
+ .mdd = mdd };
int rc;
ENTRY;
if (rc)
GOTO(out_ucleanup, rc);
- uctxt->loc_handle->lgh_logops->lop_add = llog_cat_add_rec;
- uctxt->loc_handle->lgh_logops->lop_declare_add = llog_cat_declare_add_rec;
-
rc = llog_init_handle(env, uctxt->loc_handle, LLOG_F_IS_CAT, NULL);
if (rc)
GOTO(out_uclose, rc);
if (rc < 0)
GOTO(out_uclose, rc);
}
+
+ /* find and clear any orphan changelog records (1st record index <
+ * smallest current index of all users), likely left behind by an
+ * interrupted manual or GC-thread purge whose user record had
+ * already been deleted
+ * XXX we could wait for a clear operation from a still registered
+ * user to do the job, but it may then take a long time to reach that
+ * user's actually targeted records if a huge purge backlog is still
+ * to be processed, as a long-idle user record could have been deleted
+ * XXX we may need to run the end of the purge as a separate thread
+ */
+ rc = llog_cat_process(env, ctxt->loc_handle, changelog_detect_orphan_cb,
+ &changelog_orphan, 0, 0);
+ if (rc < 0) {
+ CERROR("%s: changelog detect orphan failed: rc = %d\n",
+ obd->obd_name, rc);
+ GOTO(out_uclose, rc);
+ }
+ rc = llog_cat_process(env, uctxt->loc_handle,
+ changelog_user_detect_orphan_cb,
+ &user_orphan, 0, 0);
+ if (rc < 0) {
+ CERROR("%s: changelog user detect orphan failed: rc = %d\n",
+ obd->obd_name, rc);
+ GOTO(out_uclose, rc);
+ }
+ if (unlikely(changelog_orphan.index < user_orphan.index)) {
+ struct changelog_cancel_cookie cl_cookie = {
+ .endrec = user_orphan.index,
+ .mdd = mdd,
+ };
+
+ CWARN("%s : orphan changelog records found, starting from "
+ "index %llu to index %llu, being cleared now\n",
+ obd->obd_name, changelog_orphan.index, user_orphan.index);
+
+ /* XXX we may need to run end of purge as a separate thread */
+ rc = llog_changelog_cancel(env, ctxt, &cl_cookie);
+ if (rc < 0) {
+ CERROR("%s: purge of changelog orphan records failed: "
+ "rc = %d\n", obd->obd_name, rc);
+ GOTO(out_uclose, rc);
+ }
+ }
+
llog_ctxt_put(ctxt);
llog_ctxt_put(uctxt);
RETURN(0);
spin_lock_init(&mdd->mdd_cl.mc_user_lock);
mdd->mdd_cl.mc_lastuser = 0;
+ /* ensure a GC check will, and a thread run may, occur upon start */
+ mdd->mdd_cl.mc_gc_time = 0;
+ mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
+
rc = mdd_changelog_llog_init(env, mdd);
if (rc) {
CERROR("%s: changelog setup during init failed: rc = %d\n",
mdd->mdd_cl.mc_flags = 0;
+again:
+ /* stop GC-thread if running */
+ spin_lock(&mdd->mdd_cl.mc_lock);
+ if (likely(mdd->mdd_cl.mc_gc_task == MDD_CHLG_GC_NONE)) {
+ /* avoid any attempt to run a GC-thread */
+ mdd->mdd_cl.mc_gc_task = current;
+ spin_unlock(&mdd->mdd_cl.mc_lock);
+ } else {
+ struct task_struct *gc_task;
+
+ if (unlikely(mdd->mdd_cl.mc_gc_task == MDD_CHLG_GC_NEED ||
+ mdd->mdd_cl.mc_gc_task == MDD_CHLG_GC_START)) {
+ /* need to wait for birthing GC-thread to be started
+ * and to have set mc_gc_task to itself
+ */
+ spin_unlock(&mdd->mdd_cl.mc_lock);
+ /* Add a tiny sleep */
+ schedule_timeout_uninterruptible(1);
+ /* go back to fully check if GC-thread has started or
+ * even already exited or if a new one is starting...
+ */
+ goto again;
+ }
+ /* take a reference on the task_struct to prevent it from being
+ * freed upon exit
+ */
+ gc_task = mdd->mdd_cl.mc_gc_task;
+ get_task_struct(gc_task);
+ spin_unlock(&mdd->mdd_cl.mc_lock);
+ kthread_stop(gc_task);
+ put_task_struct(gc_task);
+ }
+
ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT);
if (ctxt) {
llog_cat_close(env, ctxt->loc_handle);
struct obd_device *obd = mdd2obd_dev(mdd);
struct llog_ctxt *ctxt;
long long unsigned cur;
+ struct changelog_cancel_cookie cookie;
int rc;
ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT);
changed since the last purge) */
mdd->mdd_cl.mc_starttime = ktime_get();
- rc = llog_cancel(env, ctxt, (struct llog_cookie *)&endrec, 0);
+ cookie.endrec = endrec;
+ cookie.mdd = mdd;
+ rc = llog_changelog_cancel(env, ctxt, &cookie);
out:
llog_ctxt_put(ctxt);
return rc;
rec->cr.cr_namelen);
rec->cr_hdr.lrh_type = CHANGELOG_REC;
rec->cr.cr_time = cl_time();
- spin_lock(&mdd->mdd_cl.mc_lock);
- rec->cr.cr_index = ++mdd->mdd_cl.mc_index;
- spin_unlock(&mdd->mdd_cl.mc_lock);
ctxt = llog_get_context(obd, LLOG_CHANGELOG_ORIG_CTXT);
LASSERT(ctxt);
return -EPERM;
}
+/* forward declaration so that mdd_obf_create() can hand over to mdd_create() */
+int mdd_create(const struct lu_env *env, struct md_object *pobj,
+ const struct lu_name *lname, struct md_object *child,
+ struct md_op_spec *spec, struct md_attr *ma);
+/*
+ * mdo_create handler installed in mdd_obf_dir_ops.
+ *
+ * Creation is refused with -EPERM unless MDS_OPEN_VOLATILE is set in
+ * spec->sp_cr_flags; in that case the request is passed unchanged to
+ * mdd_create() and its result is returned.
+ *
+ * Both exits use a plain return: the function has no ENTRY, so RETURN()
+ * would not be paired with an entry trace.
+ */
+static int mdd_obf_create(const struct lu_env *env, struct md_object *pobj,
+ const struct lu_name *lname, struct md_object *child,
+ struct md_op_spec *spec, struct md_attr *ma)
+{
+ if (spec->sp_cr_flags & MDS_OPEN_VOLATILE)
+ return mdd_create(env, pobj, lname, child, spec, ma);
+ return -EPERM;
+}
+
static struct md_dir_operations mdd_obf_dir_ops = {
.mdo_lookup = obf_lookup,
- .mdo_create = mdd_dummy_create,
+ .mdo_create = mdd_obf_create,
.mdo_rename = mdd_dummy_rename,
.mdo_link = mdd_dummy_link,
.mdo_unlink = mdd_dummy_unlink
return rc;
}
-
-static struct llog_operations hsm_actions_logops;
-
/**
* set llog methods and create LLOG_AGENT_ORIG_CTXT llog
* object in obd_device
obd->obd_lvfs_ctxt.dt = m->mdd_bottom;
rc = llog_setup(env, obd, &obd->obd_olg, LLOG_AGENT_ORIG_CTXT,
- obd, &hsm_actions_logops);
+ obd, &llog_common_cat_ops);
if (rc) {
CERROR("%s: hsm actions llog setup failed: rc = %d\n",
obd->obd_name, rc);
lfsck_degister(env, m->mdd_bottom);
mdd_hsm_actions_llog_fini(env, m);
mdd_changelog_fini(env, m);
- orph_index_fini(env, m);
+ mdd_orphan_index_fini(env, m);
mdd_dot_lustre_cleanup(env, m);
if (mdd2obd_dev(m)->u.obt.obt_nodemap_config_file) {
nm_config_file_deregister_tgt(env,
switch (cfg->lcfg_command) {
case LCFG_PARAM: {
- struct obd_device *obd = mdd2obd_dev(m);
+ ssize_t count;
- rc = class_process_proc_param(PARAM_MDD, obd->obd_vars, cfg, m);
- if (rc > 0 || rc == -ENOSYS)
+ count = class_modify_config(cfg, PARAM_MDD, &m->mdd_kobj);
+ rc = count > 0 ? 0 : count;
+ if (rc)
/* we don't understand; pass it on */
rc = next->ld_ops->ldo_process_config(env, next, cfg);
break;
break;
case LCFG_PRE_CLEANUP:
rc = next->ld_ops->ldo_process_config(env, next, cfg);
- mdd_generic_thread_stop(&m->mdd_orph_cleanup_thread);
+ mdd_generic_thread_stop(&m->mdd_orphan_cleanup_thread);
break;
case LCFG_CLEANUP:
rc = next->ld_ops->ldo_process_config(env, next, cfg);
}
static int mdd_recovery_complete(const struct lu_env *env,
- struct lu_device *d)
+ struct lu_device *d)
{
- struct mdd_device *mdd = lu2mdd_dev(d);
+ struct mdd_device *mdd = lu2mdd_dev(d); /* mdd_device obtained from d */
struct lu_device *next;
- int rc;
- ENTRY;
+ int rc;
+ ENTRY;
- LASSERT(mdd != NULL);
+ LASSERT(mdd != NULL);
next = &mdd->mdd_child->dd_lu_dev;
- /* XXX: orphans handling. */
if (!mdd->mdd_bottom->dd_rdonly)
mdd_orphan_cleanup(env, mdd);
- rc = next->ld_ops->ldo_recovery_complete(env, next);
+ rc = next->ld_ops->ldo_recovery_complete(env, next); /* after
+ * mdd_orphan_cleanup() has been called (skipped when the
+ * bottom device is read-only), forward the event to the
+ * child device
+ */
- RETURN(rc);
+ RETURN(rc); /* result of the child's ldo_recovery_complete() */
}
int mdd_local_file_create(const struct lu_env *env, struct mdd_device *mdd,
mdd->mdd_root_fid = fid;
}
- rc = orph_index_init(env, mdd);
+ rc = mdd_orphan_index_init(env, mdd);
if (rc < 0)
GOTO(out_dot, rc);
out_changelog:
mdd_changelog_fini(env, mdd);
out_orph:
- orph_index_fini(env, mdd);
+ mdd_orphan_index_fini(env, mdd);
out_dot:
if (mdd_seq_site(mdd)->ss_node_id == 0)
mdd_dot_lustre_cleanup(env, mdd);
ENTRY;
- rc = mdd_child_ops(mdd)->dt_statfs(env, mdd->mdd_child, sfs);
+ rc = mdd_child_ops(mdd)->dt_statfs(env, mdd->mdd_child, sfs, NULL);
sfs->os_namelen = min_t(__u32, sfs->os_namelen, NAME_MAX);
RETURN(rc);
}
-static struct obd_ops mdd_obd_device_ops = {
+static const struct obd_ops mdd_obd_device_ops = {
.o_owner = THIS_MODULE,
.o_connect = mdd_obd_connect,
.o_disconnect = mdd_obd_disconnect,
RETURN(-ENOMEM);
}
- /* Assume we want it on since somebody registered */
- rc = mdd_changelog_on(env, mdd);
- if (rc)
- GOTO(out, rc);
+ CFS_RACE(CFS_FAIL_CHLOG_USER_REG_UNREG_RACE);
rec->cur_hdr.lrh_len = sizeof(*rec);
rec->cur_hdr.lrh_type = CHANGELOG_USER_REC;
GOTO(out, rc = -EOVERFLOW);
}
*id = rec->cur_id = ++mdd->mdd_cl.mc_lastuser;
+ mdd->mdd_cl.mc_users++;
rec->cur_endrec = mdd->mdd_cl.mc_index;
- rec->cur_time = (__u32)get_seconds();
+ rec->cur_time = (__u32)ktime_get_real_seconds();
if (OBD_FAIL_CHECK(OBD_FAIL_TIME_IN_CHLOG_USER))
rec->cur_time = 0;
spin_unlock(&mdd->mdd_cl.mc_user_lock);
rc = llog_cat_add(env, ctxt->loc_handle, &rec->cur_hdr, NULL);
+ if (rc) {
+ CWARN("%s: Failed to register changelog user %d: rc=%d\n",
+ mdd2obd_dev(mdd)->obd_name, *id, rc);
+ spin_lock(&mdd->mdd_cl.mc_user_lock);
+ mdd->mdd_cl.mc_users--;
+ spin_unlock(&mdd->mdd_cl.mc_user_lock);
+ GOTO(out, rc);
+ }
CDEBUG(D_IOCTL, "Registered changelog user %d\n", *id);
+
+ /* Assume we want it on since somebody registered */
+ rc = mdd_changelog_on(env, mdd);
+ if (rc)
+ GOTO(out, rc);
+
out:
OBD_FREE_PTR(rec);
llog_ctxt_put(ctxt);
}
struct mdd_changelog_user_purge {
+ struct mdd_device *mcup_mdd;
__u32 mcup_id;
__u32 mcup_usercount;
__u64 mcup_minrec;
if (rc == 0) {
mcup->mcup_found = true;
mcup->mcup_usercount--;
+ spin_lock(&mcup->mcup_mdd->mdd_cl.mc_user_lock);
+ mcup->mcup_mdd->mdd_cl.mc_users--;
+ spin_unlock(&mcup->mcup_mdd->mdd_cl.mc_user_lock);
}
RETURN(rc);
struct mdd_device *mdd, __u32 id)
{
struct mdd_changelog_user_purge mcup = {
+ .mcup_mdd = mdd,
.mcup_id = id,
.mcup_found = false,
.mcup_usercount = 0,
mdd_changelog_user_purge_cb, &mcup,
0, 0);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_LLOG_PURGE_DELAY, cfs_fail_val);
+
+ if ((rc == 0) && (mcup.mcup_usercount == 0)) {
+ spin_lock(&mdd->mdd_cl.mc_user_lock);
+ if (mdd->mdd_cl.mc_users == 0) {
+ /* No more users; turn changelogs off */
+ CDEBUG(D_IOCTL, "turning off changelogs\n");
+ rc = mdd_changelog_off(env, mdd);
+ }
+ spin_unlock(&mdd->mdd_cl.mc_user_lock);
+ }
+
if ((rc == 0) && mcup.mcup_found) {
CDEBUG(D_IOCTL, "%s: Purging changelog entries for user %d "
"record=%llu\n",
GOTO(out, rc = -ENOENT);
}
- if ((rc == 0) && (mcup.mcup_usercount == 0)) {
- /* No more users; turn changelogs off */
- CDEBUG(D_IOCTL, "turning off changelogs\n");
- rc = mdd_changelog_off(env, mdd);
- }
+ CFS_RACE(CFS_FAIL_CHLOG_USER_REG_UNREG_RACE);
EXIT;
out:
*/
rec->cur_endrec = mcuc->mcuc_endrec;
- rec->cur_time = (__u32)get_seconds();
+ rec->cur_time = (__u32)ktime_get_real_seconds();
if (OBD_FAIL_CHECK(OBD_FAIL_TIME_IN_CHLOG_USER))
rec->cur_time = 0;
if (unlikely(!barrier_entry(mdd->mdd_bottom)))
RETURN(-EINPROGRESS);
- rc = mdd_changelog_user_purge(env, mdd, data->ioc_u32_1);
+ /* explicitly clear changelog first, to protect from crash in
+ * the middle of purge that would lead to unregistered consumer
+ * but pending changelog entries
+ */
+ rc = mdd_changelog_clear(env, mdd, data->ioc_u32_1, 0);
+ if (!rc)
+ rc = mdd_changelog_user_purge(env,
+ mdd, data->ioc_u32_1);
+
barrier_exit(mdd->mdd_bottom);
break;
default:
if (rc)
return rc;
- changelog_orig_logops = llog_osd_ops;
- changelog_orig_logops.lop_cancel = llog_changelog_cancel;
- changelog_orig_logops.lop_add = llog_cat_add_rec;
- changelog_orig_logops.lop_declare_add = llog_cat_declare_add_rec;
-
- hsm_actions_logops = llog_osd_ops;
- hsm_actions_logops.lop_add = llog_cat_add_rec;
- hsm_actions_logops.lop_declare_add = llog_cat_declare_add_rec;
+ changelog_orig_logops = llog_common_cat_ops;
+ changelog_orig_logops.lop_write_rec = mdd_changelog_write_rec;
- rc = class_register_type(&mdd_obd_device_ops, NULL, true, NULL,
+ rc = class_register_type(&mdd_obd_device_ops, NULL, false, NULL,
LUSTRE_MDD_NAME, &mdd_device_type);
if (rc)
lu_kmem_fini(mdd_caches);