--- /dev/null
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * lustre/mgs/mgs_barrier.c
+ *
+ * Author: Fan, Yong <fan.yong@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_MGS
+#define D_MGS D_CONFIG
+
+#include <lustre_ioctl.h>
+#include <lustre_swab.h>
+#include <lustre/lustre_barrier_user.h>
+
+#include "mgs_internal.h"
+
+/**
+ * Handle the barrier lock glimpse reply.
+ *
+ * The barrier lock glimpse reply contains the target MDT's index and
+ * the barrier operation status on that MDT. If the status reported by
+ * the MDT is the expected one, the MDT's bit is set in the barrier
+ * bitmap of the related 'fsdb'; otherwise, unless that MDT's bit is
+ * already set, the reported status is recorded as the barrier result.
+ * Any error (RPC failure, malformed reply, invalid index) is recorded
+ * as the barrier result as well.
+ *
+ * The MDT index comes from the wire, so it is validated against the
+ * bitmap capacity (INDEX_MAP_SIZE bytes, i.e. INDEX_MAP_SIZE * 8 bits)
+ * before the bitmap is touched in either branch.
+ *
+ * \param[in] env	pointer to the thread context
+ * \param[in] req	pointer to the glimpse callback RPC request
+ * \param[in] data	pointer to the async glimpse callback data
+ * \param[in] rc	the glimpse callback RPC return value
+ *
+ * \retval		0 for success
+ * \retval		negative error number on failure
+ */
+static int mgs_barrier_gl_interpret_reply(const struct lu_env *env,
+					  struct ptlrpc_request *req,
+					  void *data, int rc)
+{
+	struct ldlm_cb_async_args *ca = data;
+	struct fs_db *fsdb = ca->ca_set_arg->gl_interpret_data;
+	struct barrier_lvb *lvb;
+	ENTRY;
+
+	if (rc) {
+		if (rc == -ENODEV)
+			/* The lock is useless, cancel it. */
+			ldlm_lock_cancel(ca->ca_lock);
+
+		GOTO(out, rc);
+	}
+
+	lvb = req_capsule_server_swab_get(&req->rq_pill, &RMF_DLM_LVB,
+					  lustre_swab_barrier_lvb);
+	if (!lvb)
+		GOTO(out, rc = -EPROTO);
+
+	/* Reject an out-of-range index before any bitmap access: both
+	 * set_bit() and test_bit() below would otherwise touch memory
+	 * beyond the INDEX_MAP_SIZE-byte bitmap. */
+	if (unlikely(lvb->lvb_index >= INDEX_MAP_SIZE * 8))
+		GOTO(out, rc = -EINVAL);
+
+	if (lvb->lvb_status == fsdb->fsdb_barrier_expected)
+		set_bit(lvb->lvb_index, fsdb->fsdb_barrier_map);
+	else if (likely(!test_bit(lvb->lvb_index, fsdb->fsdb_barrier_map)))
+		fsdb->fsdb_barrier_result = lvb->lvb_status;
+
+	GOTO(out, rc);
+
+out:
+	if (rc)
+		fsdb->fsdb_barrier_result = rc;
+
+	return rc;
+}
+
+/**
+ * Send glimpse callback to the barrier locks holders.
+ *
+ * The glimpse callback takes the current barrier status. The barrier locks
+ * holders (on the MDTs) will take related barrier actions according to the
+ * given barrier status, then return their local barrier status.
+ *
+ * One glimpse work item is prepared for every lock found on the granted
+ * list of the barrier resource; the replies are processed by
+ * mgs_barrier_gl_interpret_reply(), which records its outcome in
+ * fsdb->fsdb_barrier_result and fsdb->fsdb_barrier_map.
+ *
+ * \param[in] env pointer to the thread context
+ * \param[in] mgs pointer to the MGS device
+ * \param[in] fsdb pointer to the barrier 'fsdb'
+ * \param[in] timeout indicate when the barrier will be expired
+ * \param[in] expected the expected barrier status on remote servers (MDTs)
+ *
+ * \retval positive number for unexpected barrier status
+ * \retval 0 for success
+ * \retval negative error number on failure
+ */
+static int mgs_barrier_glimpse_lock(const struct lu_env *env,
+ struct mgs_device *mgs,
+ struct fs_db *fsdb,
+ __u32 timeout, __u32 expected)
+{
+ union ldlm_gl_desc *desc = &mgs_env_info(env)->mgi_gl_desc;
+ struct ldlm_res_id res_id;
+ struct ldlm_resource *res;
+ struct ldlm_glimpse_work *work;
+ struct ldlm_glimpse_work *tmp;
+ struct list_head gl_list = LIST_HEAD_INIT(gl_list);
+ struct list_head *pos;
+ int i;
+ int rc;
+ ENTRY;
+
+ LASSERT(fsdb->fsdb_mdt_count > 0);
+
+ rc = mgc_logname2resid(fsdb->fsdb_name, &res_id, CONFIG_T_BARRIER);
+ if (rc)
+ RETURN(rc);
+
+ res = ldlm_resource_get(mgs->mgs_obd->obd_namespace, NULL, &res_id,
+ LDLM_PLAIN, 0);
+ if (IS_ERR(res))
+ RETURN(PTR_ERR(res));
+
+ /* Reset the per-call result, publish the status the reply interpreter
+ * compares against, and fill the descriptor sent with the glimpse. */
+ fsdb->fsdb_barrier_result = 0;
+ fsdb->fsdb_barrier_expected = expected;
+ desc->barrier_desc.lgbd_status = fsdb->fsdb_barrier_status;
+ desc->barrier_desc.lgbd_timeout = timeout;
+
+again:
+ /* On a retry, drop the lock references taken by the previous pass.
+ * Work items are filled from the head of the list, so the first item
+ * without a lock marks the end of the used ones. */
+ list_for_each_entry(work, &gl_list, gl_list) {
+ if (!work->gl_lock)
+ break;
+
+ LDLM_LOCK_RELEASE(work->gl_lock);
+ work->gl_lock = NULL;
+ }
+
+ /* It is not big issue to alloc more work item than needed. */
+ /* Every pass (including retries) appends fsdb_mdt_count new items. */
+ for (i = 0; i < fsdb->fsdb_mdt_count; i++) {
+ OBD_ALLOC_PTR(work);
+ if (!work)
+ GOTO(out, rc = -ENOMEM);
+
+ list_add_tail(&work->gl_list, &gl_list);
+ }
+
+ /* Start filling from the first preallocated work item. */
+ work = list_entry(gl_list.next, struct ldlm_glimpse_work, gl_list);
+
+ lock_res(res);
+ list_for_each(pos, &res->lr_granted) {
+ struct ldlm_lock *lock = list_entry(pos, struct ldlm_lock,
+ l_res_link);
+
+ work->gl_lock = LDLM_LOCK_GET(lock);
+ work->gl_flags = 0;
+ work->gl_desc = desc;
+ work->gl_interpret_reply = mgs_barrier_gl_interpret_reply;
+ work->gl_interpret_data = fsdb;
+
+ /* The last preallocated work item has just been used. */
+ if (unlikely(work->gl_list.next == &gl_list)) {
+ /* ... and this was the last granted lock too: done. */
+ if (likely(pos->next == &res->lr_granted))
+ break;
+
+ unlock_res(res);
+ /* The granted locks are more than the MDTs count. */
+ goto again;
+ }
+
+ work = list_entry(work->gl_list.next, struct ldlm_glimpse_work,
+ gl_list);
+ }
+ unlock_res(res);
+
+ /* The MDTs count may be more than the granted locks. */
+ /* Trim the unused work items from the tail of the list. */
+ list_for_each_entry_safe_reverse(work, tmp, &gl_list, gl_list) {
+ if (work->gl_lock)
+ break;
+
+ list_del(&work->gl_list);
+ OBD_FREE_PTR(work);
+ }
+
+ /* An empty list means no granted barrier lock was found. */
+ if (!list_empty(&gl_list))
+ rc = ldlm_glimpse_locks(res, &gl_list);
+ else
+ rc = -ENODEV;
+
+ GOTO(out, rc);
+
+out:
+ /* Free whatever work items are still on the list, releasing the lock
+ * reference held by each used item. */
+ list_for_each_entry_safe(work, tmp, &gl_list, gl_list) {
+ list_del(&work->gl_list);
+ if (work->gl_lock)
+ LDLM_LOCK_RELEASE(work->gl_lock);
+ OBD_FREE_PTR(work);
+ }
+
+ ldlm_resource_putref(res);
+ /* No local error: report what the reply interpreter recorded, i.e.
+ * 0, an unexpected barrier status, or a negative error. */
+ if (!rc)
+ rc = fsdb->fsdb_barrier_result;
+
+ return rc;
+}
+
+/**
+ * Seed the MDT bitmap of a barrier fsdb from another fsdb.
+ *
+ * Looks up the fsdb called \a name; when it exists, its MDT index map
+ * and MDT count are copied into \a b_fsdb and the looked-up fsdb is
+ * released again with mgs_put_fsdb(). Nothing is changed when no such
+ * fsdb is found.
+ *
+ * \param[in] mgs	pointer to the MGS device
+ * \param[in] b_fsdb	the barrier fsdb to be filled in
+ * \param[in] name	name of the fsdb to copy the MDT bitmap from
+ */
+static void mgs_barrier_bitmap_setup(struct mgs_device *mgs,
+				     struct fs_db *b_fsdb,
+				     const char *name)
+{
+	struct fs_db *src = mgs_find_fsdb(mgs, name);
+
+	if (unlikely(!src))
+		return;
+
+	memcpy(b_fsdb->fsdb_mdt_index_map, src->fsdb_mdt_index_map,
+	       INDEX_MAP_SIZE);
+	b_fsdb->fsdb_mdt_count = src->fsdb_mdt_count;
+
+	mgs_put_fsdb(mgs, src);
+}
+
+/**
+ * Check whether all known MDTs are marked in the barrier bitmap.
+ *
+ * \param[in] fsdb	pointer to the barrier 'fsdb'
+ *
+ * \retval		true if every bit set in fsdb_mdt_index_map is
+ *			also set in fsdb_barrier_map
+ * \retval		false if at least one known MDT is not marked
+ */
+static bool mgs_barrier_done(struct fs_db *fsdb)
+{
+	const int nbits = INDEX_MAP_SIZE * 8;
+	int idx = 0;
+
+	while (idx < nbits) {
+		/* A known MDT that is not marked yet: not done. */
+		if (test_bit(idx, fsdb->fsdb_mdt_index_map) &&
+		    !test_bit(idx, fsdb->fsdb_barrier_map))
+			return false;
+
+		idx++;
+	}
+
+	return true;
+}
+
+/**
+ * Create the barrier for the given instance.
+ *
+ * We use two-phases barrier to guarantee that after the barrier setup:
+ * 1) All the server side pending async modification RPCs have been flushed.
+ * 2) Any subsequent modification will be blocked.
+ * 3) All async transactions on the MDTs have been committed.
+ *
+ * For phase1, we do the following:
+ *
+ * Firstly, it sets barrier flag on the instance that will block subsequent
+ * modifications from clients. (Note: server sponsored modification will be
+ * allowed for flush pending modifications)
+ *
+ * Secondly, it will flush all pending modification via dt_sync(), such as
+ * async OST-object destroy, async OST-object owner changes, and so on.
+ *
+ * If there are some on-handling clients sponsored modifications during the
+ * barrier creating, then related modifications may cause pending requests
+ * after the first dt_sync(), so call dt_sync() again after all on-handling
+ * modifications done.
+ *
+ * With the phase1 barrier set, all pending cross-servers modification RPCs
+ * have been flushed to remote servers, and any new modification will be
+ * blocked. But it does not guarantees that all the updates have been
+ * committed to storage on remote servers. So when all the instances have
+ * done phase1 barrier successfully, the MGS will notify all instances to
+ * do the phase2 barrier as following:
+ *
+ * Every barrier instance will call dt_sync() to make all async transactions
+ * to be committed locally.
+ *
+ * \param[in] env pointer to the thread context
+ * \param[in] mgs pointer to the MGS device
+ * \param[in] bc pointer to the barrier control structure
+ *
+ * \retval 0 for success
+ * \retval negative error number on failure
+ */
+static int mgs_barrier_freeze(const struct lu_env *env,
+ struct mgs_device *mgs,
+ struct barrier_ctl *bc)
+{
+ char *name = mgs_env_info(env)->mgi_fsname;
+ struct fs_db *fsdb;
+ int rc = 0;
+ int left;
+ bool phase1 = true;
+ bool dirty = false;
+ ENTRY;
+
+ /* The barrier fsdb is named "<fsname>-<BARRIER_FILENAME>". */
+ snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
+ bc->bc_name, BARRIER_FILENAME);
+
+ down_write(&mgs->mgs_barrier_rwsem);
+ mutex_lock(&mgs->mgs_mutex);
+
+ fsdb = mgs_find_fsdb(mgs, name);
+ if (!fsdb) {
+ mutex_unlock(&mgs->mgs_mutex);
+ up_write(&mgs->mgs_barrier_rwsem);
+
+ RETURN(-ENODEV);
+ }
+
+ /* No MDT recorded yet: try to copy the MDT bitmap from the fsdb
+ * named after the filesystem itself. */
+ if (unlikely(fsdb->fsdb_mdt_count == 0))
+ mgs_barrier_bitmap_setup(mgs, fsdb, bc->bc_name);
+
+ /* Take the per-fsdb mutex before dropping the MGS-wide one. */
+ mutex_lock(&fsdb->fsdb_mutex);
+ mutex_unlock(&mgs->mgs_mutex);
+
+ switch (fsdb->fsdb_barrier_status) {
+ case BS_THAWING:
+ case BS_RESCAN:
+ rc = -EBUSY;
+ break;
+ case BS_FREEZING_P1:
+ case BS_FREEZING_P2:
+ rc = -EINPROGRESS;
+ break;
+ case BS_FROZEN:
+ /* Frozen and not yet timed out: nothing to do. */
+ if (cfs_time_before(cfs_time_current_sec(),
+ fsdb->fsdb_barrier_latest_create_time +
+ fsdb->fsdb_barrier_timeout)) {
+ rc = -EALREADY;
+ break;
+ }
+ /* fallthrough: the frozen barrier has timed out, freeze again */
+ case BS_INIT:
+ case BS_THAWED:
+ case BS_EXPIRED:
+ case BS_FAILED:
+ if (fsdb->fsdb_barrier_disabled) {
+ rc = -EOPNOTSUPP;
+ } else if (unlikely(fsdb->fsdb_mdt_count == 0)) {
+ rc = -ENODEV;
+ } else {
+ /* Start phase 1 with a fresh timestamp, the requested
+ * (or default) timeout and an empty barrier bitmap. */
+ fsdb->fsdb_barrier_latest_create_time =
+ cfs_time_current_sec();
+ fsdb->fsdb_barrier_status = BS_FREEZING_P1;
+ if (bc->bc_timeout != 0)
+ fsdb->fsdb_barrier_timeout = bc->bc_timeout;
+ else
+ fsdb->fsdb_barrier_timeout =
+ BARRIER_TIMEOUT_DEFAULT;
+ memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
+ }
+ break;
+ default:
+ LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
+ bc->bc_name, fsdb->fsdb_barrier_status);
+ rc = -EINVAL;
+ LBUG();
+ }
+
+ if (rc)
+ GOTO(out, rc);
+
+ /* Remaining time, passed to the MDTs with the glimpse. */
+ left = fsdb->fsdb_barrier_timeout;
+
+again:
+ /* Both locks are dropped while the glimpse is in progress. */
+ mutex_unlock(&fsdb->fsdb_mutex);
+ up_write(&mgs->mgs_barrier_rwsem);
+
+ CFS_FAIL_TIMEOUT(OBD_FAIL_BARRIER_DELAY, cfs_fail_val);
+
+ rc = mgs_barrier_glimpse_lock(env, mgs, fsdb, left,
+ phase1 ? BS_FREEZING_P1 : BS_FROZEN);
+ down_write(&mgs->mgs_barrier_rwsem);
+ mutex_lock(&fsdb->fsdb_mutex);
+
+ /* A glimpse has been issued: on failure the 'out' path has to
+ * send a thaw glimpse to the MDTs. */
+ dirty = true;
+ left = fsdb->fsdb_barrier_latest_create_time +
+ fsdb->fsdb_barrier_timeout - cfs_time_current_sec();
+ /* The barrier timed out while the glimpse was in progress. */
+ if (left <= 0) {
+ fsdb->fsdb_barrier_status = BS_EXPIRED;
+
+ GOTO(out, rc = -ETIME);
+ }
+
+ LASSERTF(fsdb->fsdb_barrier_status ==
+ (phase1 ? BS_FREEZING_P1 : BS_FREEZING_P2),
+ "unexpected barrier status %u\n",
+ fsdb->fsdb_barrier_status);
+
+ if (rc == -ETIMEDOUT) {
+ fsdb->fsdb_barrier_status = BS_EXPIRED;
+ rc = -ETIME;
+ } else if (rc > 0) {
+ /* A positive value is the unexpected barrier status
+ * returned by the glimpse; adopt it as our status. */
+ fsdb->fsdb_barrier_status = rc;
+ rc = -EREMOTE;
+ } else if (rc < 0) {
+ fsdb->fsdb_barrier_status = BS_FAILED;
+ } else if (mgs_barrier_done(fsdb)) {
+ if (phase1) {
+ /* Phase 1 done on all known MDTs: start phase 2
+ * with an empty barrier bitmap. */
+ fsdb->fsdb_barrier_status = BS_FREEZING_P2;
+ memset(fsdb->fsdb_barrier_map, 0,
+ INDEX_MAP_SIZE);
+ phase1 = false;
+
+ goto again;
+ } else {
+ fsdb->fsdb_barrier_status = BS_FROZEN;
+ }
+ } else {
+ /* The glimpse succeeded, but some known MDT is not marked
+ * in the barrier bitmap. */
+ fsdb->fsdb_barrier_status = BS_FAILED;
+ rc = -EREMOTE;
+ }
+
+ GOTO(out, rc);
+
+out:
+ mutex_unlock(&fsdb->fsdb_mutex);
+ up_write(&mgs->mgs_barrier_rwsem);
+ /* Failed after a glimpse was sent: clear the bitmap and send a
+ * glimpse expecting BS_THAWED; its result is ignored. Note that
+ * this runs after both locks have been released. */
+ if (rc && dirty) {
+ memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
+ mgs_barrier_glimpse_lock(env, mgs, fsdb, 0, BS_THAWED);
+ }
+
+ mgs_put_fsdb(mgs, fsdb);
+
+ return rc;
+}
+
+/**
+ * Thaw the barrier for the given instance.
+ *
+ * If the barrier is frozen, expired or failed, mark it as BS_THAWING and
+ * send a glimpse expecting BS_THAWED to the barrier lock holders. The
+ * barrier becomes BS_THAWED only when the glimpse succeeds and every known
+ * MDT is marked in the barrier bitmap.
+ *
+ * \param[in] env pointer to the thread context
+ * \param[in] mgs pointer to the MGS device
+ * \param[in] bc pointer to the barrier control structure
+ *
+ * \retval 0 for success
+ * \retval negative error number on failure
+ */
+static int mgs_barrier_thaw(const struct lu_env *env,
+ struct mgs_device *mgs,
+ struct barrier_ctl *bc)
+{
+ char *name = mgs_env_info(env)->mgi_fsname;
+ struct fs_db *fsdb;
+ int rc = 0;
+ ENTRY;
+
+ /* The barrier fsdb is named "<fsname>-<BARRIER_FILENAME>". */
+ snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
+ bc->bc_name, BARRIER_FILENAME);
+
+ down_write(&mgs->mgs_barrier_rwsem);
+ mutex_lock(&mgs->mgs_mutex);
+
+ fsdb = mgs_find_fsdb(mgs, name);
+ if (!fsdb) {
+ mutex_unlock(&mgs->mgs_mutex);
+ up_write(&mgs->mgs_barrier_rwsem);
+
+ RETURN(-ENODEV);
+ }
+
+ /* No MDT recorded yet: try to copy the MDT bitmap from the fsdb
+ * named after the filesystem itself. */
+ if (unlikely(fsdb->fsdb_mdt_count == 0))
+ mgs_barrier_bitmap_setup(mgs, fsdb, bc->bc_name);
+
+ /* Take the per-fsdb mutex before dropping the MGS-wide one. */
+ mutex_lock(&fsdb->fsdb_mutex);
+ mutex_unlock(&mgs->mgs_mutex);
+
+ switch (fsdb->fsdb_barrier_status) {
+ case BS_FREEZING_P1:
+ case BS_FREEZING_P2:
+ case BS_RESCAN:
+ rc = -EBUSY;
+ break;
+ case BS_INIT:
+ case BS_THAWED:
+ rc = -EALREADY;
+ break;
+ case BS_THAWING:
+ rc = -EINPROGRESS;
+ break;
+ case BS_FROZEN:
+ case BS_EXPIRED: /* The barrier on some MDT(s) may be expired,
+ * but may be not on others. Destroy anyway. */
+ case BS_FAILED:
+ if (unlikely(fsdb->fsdb_mdt_count == 0)) {
+ rc = -ENODEV;
+ } else {
+ /* Start thawing with an empty barrier bitmap. */
+ fsdb->fsdb_barrier_status = BS_THAWING;
+ memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
+ }
+ break;
+ default:
+ LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
+ bc->bc_name, fsdb->fsdb_barrier_status);
+ rc = -EINVAL;
+ LBUG();
+ }
+
+ if (rc)
+ GOTO(out, rc);
+
+ /* Both locks are dropped while the glimpse is in progress. */
+ mutex_unlock(&fsdb->fsdb_mutex);
+ up_write(&mgs->mgs_barrier_rwsem);
+
+ CFS_FAIL_TIMEOUT(OBD_FAIL_BARRIER_DELAY, cfs_fail_val);
+
+ rc = mgs_barrier_glimpse_lock(env, mgs, fsdb, 0, BS_THAWED);
+ down_write(&mgs->mgs_barrier_rwsem);
+ mutex_lock(&fsdb->fsdb_mutex);
+
+ LASSERTF(fsdb->fsdb_barrier_status == BS_THAWING,
+ "unexpected barrier status %u\n",
+ fsdb->fsdb_barrier_status);
+
+ if (rc > 0) {
+ /* A positive value is the unexpected barrier status
+ * returned by the glimpse; adopt it as our status. */
+ fsdb->fsdb_barrier_status = rc;
+ rc = -EREMOTE;
+ } else if (rc < 0) {
+ fsdb->fsdb_barrier_status = BS_FAILED;
+ } else if (mgs_barrier_done(fsdb)) {
+ fsdb->fsdb_barrier_status = BS_THAWED;
+ } else {
+ /* The glimpse succeeded, but some known MDT is not marked
+ * in the barrier bitmap. */
+ fsdb->fsdb_barrier_status = BS_FAILED;
+ rc = -EREMOTE;
+ }
+
+ GOTO(out, rc);
+
+out:
+ mutex_unlock(&fsdb->fsdb_mutex);
+ up_write(&mgs->mgs_barrier_rwsem);
+ mgs_put_fsdb(mgs, fsdb);
+
+ return rc;
+}
+
+/**
+ * Query the barrier status for the given instance.
+ *
+ * The status is returned in bc->bc_status. While the barrier is being
+ * frozen or is frozen, the remaining time before it expires is returned
+ * in bc->bc_timeout; if that time has already passed, the barrier is
+ * switched to BS_EXPIRED and that status is reported instead. When no
+ * barrier fsdb exists for the instance, BS_INIT is reported.
+ *
+ * \param[in] env	pointer to the thread context
+ * \param[in] mgs	pointer to the MGS device
+ * \param[in,out] bc	pointer to the barrier control structure
+ *
+ * \retval		0 always
+ */
+static int mgs_barrier_stat(const struct lu_env *env,
+			    struct mgs_device *mgs,
+			    struct barrier_ctl *bc)
+{
+	char *name = mgs_env_info(env)->mgi_fsname;
+	struct fs_db *fsdb;
+	ENTRY;
+
+	/* The barrier fsdb is named "<fsname>-<BARRIER_FILENAME>". */
+	snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
+		 bc->bc_name, BARRIER_FILENAME);
+
+	mutex_lock(&mgs->mgs_mutex);
+	fsdb = mgs_find_fsdb(mgs, name);
+	if (!fsdb) {
+		mutex_unlock(&mgs->mgs_mutex);
+		bc->bc_status = BS_INIT;
+
+		RETURN(0);
+	}
+
+	/* Take the per-fsdb mutex before dropping the MGS-wide one. */
+	mutex_lock(&fsdb->fsdb_mutex);
+	mutex_unlock(&mgs->mgs_mutex);
+
+	bc->bc_status = fsdb->fsdb_barrier_status;
+	switch (bc->bc_status) {
+	case BS_FREEZING_P1:
+	case BS_FREEZING_P2:
+	case BS_FROZEN:
+		if (!cfs_time_before(cfs_time_current_sec(),
+				     fsdb->fsdb_barrier_latest_create_time +
+				     fsdb->fsdb_barrier_timeout)) {
+			/* Timed out: expire the barrier right here. */
+			fsdb->fsdb_barrier_status = BS_EXPIRED;
+			bc->bc_status = BS_EXPIRED;
+		} else {
+			bc->bc_timeout =
+				fsdb->fsdb_barrier_latest_create_time +
+				fsdb->fsdb_barrier_timeout -
+				cfs_time_current_sec();
+		}
+		break;
+	default:
+		break;
+	}
+
+	mutex_unlock(&fsdb->fsdb_mutex);
+	mgs_put_fsdb(mgs, fsdb);
+
+	RETURN(0);
+}
+
+/**
+ * Rescan the MDT bitmap of the barrier for the given instance.
+ *
+ * The barrier fsdb takes over the MDT index map and MDT count of the
+ * filesystem's own fsdb, then a glimpse expecting BS_INIT is sent to the
+ * barrier lock holders. If the glimpse times out, it is retried as long
+ * as bc->bc_timeout seconds have not passed since the rescan started.
+ * Afterwards bc->bc_total is set to the number of MDTs that are marked in
+ * the barrier bitmap or were known before, bc->bc_absence to the number
+ * of MDTs that were known but are not marked, and the barrier bitmap
+ * becomes the new MDT index map.
+ *
+ * \param[in] env	pointer to the thread context
+ * \param[in] mgs	pointer to the MGS device
+ * \param[in,out] bc	pointer to the barrier control structure
+ *
+ * \retval		0 for success
+ * \retval		negative error number on failure
+ */
+static int mgs_barrier_rescan(const struct lu_env *env,
+			      struct mgs_device *mgs,
+			      struct barrier_ctl *bc)
+{
+	char *name = mgs_env_info(env)->mgi_fsname;
+	struct fs_db *b_fsdb;
+	struct fs_db *c_fsdb;
+	int rc = 0;
+	ENTRY;
+
+	down_write(&mgs->mgs_barrier_rwsem);
+	mutex_lock(&mgs->mgs_mutex);
+
+	c_fsdb = mgs_find_fsdb(mgs, bc->bc_name);
+	if (!c_fsdb || unlikely(c_fsdb->fsdb_mdt_count == 0)) {
+		mutex_unlock(&mgs->mgs_mutex);
+		up_write(&mgs->mgs_barrier_rwsem);
+		/* The fsdb may have been found but have no MDT yet:
+		 * release it, as done on the other early-exit path. */
+		if (c_fsdb)
+			mgs_put_fsdb(mgs, c_fsdb);
+
+		RETURN(-ENODEV);
+	}
+
+	/* The barrier fsdb is named "<fsname>-<BARRIER_FILENAME>". */
+	snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
+		 bc->bc_name, BARRIER_FILENAME);
+	b_fsdb = mgs_find_fsdb(mgs, name);
+	if (!b_fsdb) {
+		mutex_unlock(&mgs->mgs_mutex);
+		up_write(&mgs->mgs_barrier_rwsem);
+		mgs_put_fsdb(mgs, c_fsdb);
+
+		RETURN(-ENODEV);
+	}
+
+	/* Take both per-fsdb mutexes before dropping the MGS-wide one. */
+	mutex_lock(&b_fsdb->fsdb_mutex);
+	mutex_lock(&c_fsdb->fsdb_mutex);
+	mutex_unlock(&mgs->mgs_mutex);
+
+	switch (b_fsdb->fsdb_barrier_status) {
+	case BS_RESCAN:
+		rc = -EINPROGRESS;
+		break;
+	case BS_THAWING:
+	case BS_FREEZING_P1:
+	case BS_FREEZING_P2:
+		rc = -EBUSY;
+		break;
+	case BS_FROZEN:
+		/* Frozen and not yet timed out: do not disturb it. */
+		if (cfs_time_before(cfs_time_current_sec(),
+				    b_fsdb->fsdb_barrier_latest_create_time +
+				    b_fsdb->fsdb_barrier_timeout)) {
+			rc = -EBUSY;
+			break;
+		}
+		/* fallthrough: the frozen barrier has timed out */
+	case BS_INIT:
+	case BS_THAWED:
+	case BS_EXPIRED:
+	case BS_FAILED:
+		b_fsdb->fsdb_barrier_latest_create_time =
+			cfs_time_current_sec();
+		b_fsdb->fsdb_barrier_status = BS_RESCAN;
+		memcpy(b_fsdb->fsdb_mdt_index_map, c_fsdb->fsdb_mdt_index_map,
+		       INDEX_MAP_SIZE);
+		memset(b_fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
+		b_fsdb->fsdb_mdt_count = c_fsdb->fsdb_mdt_count;
+		break;
+	default:
+		LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
+			      bc->bc_name, b_fsdb->fsdb_barrier_status);
+		rc = -EINVAL;
+		LBUG();
+	}
+
+	/* The filesystem's own fsdb is not needed any longer. */
+	mutex_unlock(&c_fsdb->fsdb_mutex);
+	mgs_put_fsdb(mgs, c_fsdb);
+
+	if (rc)
+		GOTO(out, rc);
+
+again:
+	/* Both locks are dropped while the glimpse is in progress. */
+	mutex_unlock(&b_fsdb->fsdb_mutex);
+	up_write(&mgs->mgs_barrier_rwsem);
+	rc = mgs_barrier_glimpse_lock(env, mgs, b_fsdb, 0, BS_INIT);
+	down_write(&mgs->mgs_barrier_rwsem);
+	mutex_lock(&b_fsdb->fsdb_mutex);
+
+	LASSERTF(b_fsdb->fsdb_barrier_status == BS_RESCAN,
+		 "unexpected barrier status %u\n",
+		 b_fsdb->fsdb_barrier_status);
+
+	if (rc > 0) {
+		/* A positive value is the unexpected barrier status
+		 * returned by the glimpse; adopt it as our status. */
+		b_fsdb->fsdb_barrier_status = rc;
+		rc = -EREMOTE;
+	} else if (rc == -ETIMEDOUT &&
+		   cfs_time_before(cfs_time_current_sec(),
+				   b_fsdb->fsdb_barrier_latest_create_time +
+				   bc->bc_timeout)) {
+		/* Timed out, but still within the caller's time budget:
+		 * retry with an empty barrier bitmap. */
+		memset(b_fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
+
+		goto again;
+	} else if (rc < 0 && rc != -ETIMEDOUT && rc != -ENODEV) {
+		b_fsdb->fsdb_barrier_status = BS_FAILED;
+	} else {
+		int i;
+
+		/* Success, final timeout or -ENODEV: account the MDTs
+		 * and report success. */
+		b_fsdb->fsdb_mdt_count = 0;
+		bc->bc_total = 0;
+		bc->bc_absence = 0;
+		rc = 0;
+		for (i = 0; i < INDEX_MAP_SIZE * 8; i++) {
+			if (test_bit(i, b_fsdb->fsdb_barrier_map)) {
+				b_fsdb->fsdb_mdt_count++;
+			} else if (test_bit(i, b_fsdb->fsdb_mdt_index_map)) {
+				/* Known before, but not marked now. */
+				b_fsdb->fsdb_mdt_count++;
+				bc->bc_absence++;
+			}
+		}
+
+		bc->bc_total = b_fsdb->fsdb_mdt_count;
+		memcpy(b_fsdb->fsdb_mdt_index_map,
+		       b_fsdb->fsdb_barrier_map, INDEX_MAP_SIZE);
+		b_fsdb->fsdb_barrier_status = BS_INIT;
+	}
+
+	GOTO(out, rc);
+
+out:
+	mutex_unlock(&b_fsdb->fsdb_mutex);
+	up_write(&mgs->mgs_barrier_rwsem);
+	mgs_put_fsdb(mgs, b_fsdb);
+
+	return rc;
+}
+
+/**
+ * Dispatch a barrier ioctl to the handler of the requested command.
+ *
+ * The barrier control structure is carried in the first inline buffer of
+ * the ioctl data. The buffer is checked for presence and size before it
+ * is interpreted, then the version and the name length are validated.
+ *
+ * \param[in] env	pointer to the thread context
+ * \param[in] mgs	pointer to the MGS device
+ * \param[in,out] data	pointer to the ioctl data holding the barrier_ctl
+ *
+ * \retval		0 for success
+ * \retval		-EINVAL for a missing or short buffer, a too long
+ *			name or an unknown command
+ * \retval		-EOPNOTSUPP for an unsupported barrier_ctl version
+ * \retval		other negative error number from the handlers
+ */
+int mgs_iocontrol_barrier(const struct lu_env *env,
+			  struct mgs_device *mgs,
+			  struct obd_ioctl_data *data)
+{
+	struct barrier_ctl *bc = (struct barrier_ctl *)(data->ioc_inlbuf1);
+	int rc;
+	ENTRY;
+
+	/* Do not read a barrier_ctl from a buffer that is absent or too
+	 * small to hold one. */
+	if (unlikely(!bc || data->ioc_inllen1 < sizeof(*bc)))
+		RETURN(-EINVAL);
+
+	if (unlikely(bc->bc_version != BARRIER_VERSION_V1))
+		RETURN(-EOPNOTSUPP);
+
+	/* strnlen() is bounded by the field size, so an unterminated name
+	 * is rejected here as well (provided the field is longer than 8). */
+	if (unlikely(strnlen(bc->bc_name, sizeof(bc->bc_name)) > 8))
+		RETURN(-EINVAL);
+
+	switch (bc->bc_cmd) {
+	case BC_FREEZE:
+		rc = mgs_barrier_freeze(env, mgs, bc);
+		break;
+	case BC_THAW:
+		rc = mgs_barrier_thaw(env, mgs, bc);
+		break;
+	case BC_STAT:
+		rc = mgs_barrier_stat(env, mgs, bc);
+		break;
+	case BC_RESCAN:
+		rc = mgs_barrier_rescan(env, mgs, bc);
+		break;
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	RETURN(rc);
+}
}
run_test 409 "Large amount of cross-MDTs hard links on the same file"
+prep_801() {
+ start_full_debug_logging
+ # cleanup unused barrier locks before test
+ do_facet mgs $LCTL barrier_rescan $FSNAME ||
+ error "Fail to prep barrier test env"
+}
+
+# Common teardown for the 801 barrier tests: stop the full debug logging
+# that prep_801 started.
+post_801() {
+ stop_full_debug_logging
+}
+
+# Walk the barrier through its states with the lctl barrier_* commands and
+# check the status printed by barrier_stat after every step:
+# freezing_p1 -> frozen -> expired -> frozen -> thawing -> thawed -> failed.
+test_801a() {
+ prep_801
+
+ # Delay the freeze on the MGS so the intermediate status can be seen.
+ #define OBD_FAIL_BARRIER_DELAY 0x2202
+ do_facet mgs $LCTL set_param fail_val=3 fail_loc=0x2202
+ do_facet mgs $LCTL barrier_freeze $FSNAME 10 &
+
+ sleep 1
+ local b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'freezing_p1'" ] ||
+ error "(1) unexpected barrier status $b_status"
+
+ do_facet mgs $LCTL set_param fail_val=0 fail_loc=0
+ # Wait for the background barrier_freeze to finish.
+ wait
+ b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'frozen'" ] ||
+ error "(2) unexpected barrier status $b_status"
+
+ # Sleep past the remaining time reported by barrier_stat.
+ local expired=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/will be expired/ { print $7 }')
+ echo "sleep $((expired + 3)) seconds, then the barrier will be expired"
+ sleep $((expired + 3))
+
+ b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'expired'" ] ||
+ error "(3) unexpected barrier status $b_status"
+
+ # An expired barrier can be frozen again.
+ do_facet mgs $LCTL barrier_freeze $FSNAME 10 ||
+ error "(4) fail to freeze barrier"
+
+ b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'frozen'" ] ||
+ error "(5) unexpected barrier status $b_status"
+
+ # Delay the thaw on the MGS so the intermediate status can be seen.
+ #define OBD_FAIL_BARRIER_DELAY 0x2202
+ do_facet mgs $LCTL set_param fail_val=3 fail_loc=0x2202
+ do_facet mgs $LCTL barrier_thaw $FSNAME &
+
+ sleep 1
+ b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'thawing'" ] ||
+ error "(6) unexpected barrier status $b_status"
+
+ do_facet mgs $LCTL set_param fail_val=0 fail_loc=0
+ # Wait for the background barrier_thaw to finish.
+ wait
+ b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'thawed'" ] ||
+ error "(7) unexpected barrier status $b_status"
+
+ # Inject a barrier failure on the MDS; the exit status of this freeze
+ # is not checked, only the resulting barrier status is.
+ #define OBD_FAIL_BARRIER_FAILURE 0x2203
+ do_facet $SINGLEMDS $LCTL set_param fail_loc=0x2203
+ do_facet mgs $LCTL barrier_freeze $FSNAME
+
+ b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'failed'" ] ||
+ error "(8) unexpected barrier status $b_status"
+
+ # Clear the failure injection and thaw the failed barrier.
+ do_facet $SINGLEMDS $LCTL set_param fail_loc=0
+ do_facet mgs $LCTL barrier_thaw $FSNAME
+
+ post_801
+}
+run_test 801a "write barrier user interfaces and stat machine"
+
+test_801b() {
+ prep_801
+
+ mkdir $DIR/$tdir || error "(1) fail to mkdir"
+ createmany -d $DIR/$tdir/d 6 || "(2) fail to mkdir"
+ touch $DIR/$tdir/d2/f10 || error "(3) fail to touch"
+ touch $DIR/$tdir/d3/f11 || error "(4) fail to touch"
+ touch $DIR/$tdir/d4/f12 || error "(5) fail to touch"
+
+ cancel_lru_locks mdc
+
+ # 180 seconds should be long enough
+ do_facet mgs $LCTL barrier_freeze $FSNAME 180
+
+ local b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'frozen'" ] ||
+ error "(6) unexpected barrier status $b_status"
+
+ mkdir $DIR/$tdir/d0/d10 &
+ mkdir_pid=$!
+
+ touch $DIR/$tdir/d1/f13 &
+ touch_pid=$!
+
+ ln $DIR/$tdir/d2/f10 $DIR/$tdir/d2/f14 &
+ ln_pid=$!
+
+ mv $DIR/$tdir/d3/f11 $DIR/$tdir/d3/f15 &
+ mv_pid=$!
+
+ rm -f $DIR/$tdir/d4/f12 &
+ rm_pid=$!
+
+ stat $DIR/$tdir/d5 || error "(7) stat should succeed"
+
+ # To guarantee taht the 'stat' is not blocked
+ b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'frozen'" ] ||
+ error "(8) unexpected barrier status $b_status"
+
+ # let above commands to run at background
+ sleep 5
+
+ ps -p $mkdir_pid || error "(9) mkdir should be blocked"
+ ps -p $touch_pid || error "(10) touch should be blocked"
+ ps -p $ln_pid || error "(11) link should be blocked"
+ ps -p $mv_pid || error "(12) rename should be blocked"
+ ps -p $rm_pid || error "(13) unlink should be blocked"
+
+ b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'frozen'" ] ||
+ error "(14) unexpected barrier status $b_status"
+
+ do_facet mgs $LCTL barrier_thaw $FSNAME
+ b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'thawed'" ] ||
+ error "(15) unexpected barrier status $b_status"
+
+ wait $mkdir_pid || error "(16) mkdir should succeed"
+ wait $touch_pid || error "(17) touch should succeed"
+ wait $ln_pid || error "(18) link should succeed"
+ wait $mv_pid || error "(19) rename should succeed"
+ wait $rm_pid || error "(20) unlink should succeed"
+
+ post_801
+}
+run_test 801b "modification will be blocked by write barrier"
+
+# Check barrier_rescan: with mds2 stopped a freeze must end up expired or
+# failed; after a rescan the barrier can be frozen and thawed again; after
+# mds2 is restarted a further rescan must succeed.
+test_801c() {
+ [[ $MDSCOUNT -lt 2 ]] && skip "needs >= 2 MDTs" && return
+
+ prep_801
+
+ stop mds2 || error "(1) Fail to stop mds2"
+
+ # The exit status of this freeze is not checked, only the resulting
+ # barrier status is.
+ do_facet mgs $LCTL barrier_freeze $FSNAME 30
+
+ local b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'expired'" -o "$b_status" = "'failed'" ] || {
+ # Thaw before reporting the error.
+ do_facet mgs $LCTL barrier_thaw $FSNAME
+ error "(2) unexpected barrier status $b_status"
+ }
+
+ do_facet mgs $LCTL barrier_rescan $FSNAME ||
+ error "(3) Fail to rescan barrier bitmap"
+
+ # After the rescan the freeze is expected to succeed.
+ do_facet mgs $LCTL barrier_freeze $FSNAME 10
+
+ b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'frozen'" ] ||
+ error "(4) unexpected barrier status $b_status"
+
+ do_facet mgs $LCTL barrier_thaw $FSNAME
+ b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+ awk '/The barrier for/ { print $7 }')
+ [ "$b_status" = "'thawed'" ] ||
+ error "(5) unexpected barrier status $b_status"
+
+ local devname=$(mdsdevname 2)
+
+ start mds2 $devname $MDS_MOUNT_OPTS || error "(6) Fail to start mds2"
+
+ # Rescan once more with mds2 running again.
+ do_facet mgs $LCTL barrier_rescan $FSNAME ||
+ error "(7) Fail to rescan barrier bitmap"
+
+ post_801
+}
+run_test 801c "rescan barrier bitmap"
+
#
# tests that do cleanup/setup should be run at the end
#
#include <lnet/lnetctl.h>
#include <lustre/lustreapi.h>
#include <lustre_param.h>
+#include <lustre/lustre_barrier_user.h>
#define MAX_STRING_SIZE 128
return rc;
}
+/**
+ * Map a barrier status to the name printed by 'lctl barrier_stat'.
+ *
+ * \param[in] status	the barrier status
+ *
+ * \retval		the status name, or "unknown" for any value that
+ *			is not one of the handled statuses
+ */
+static const char *barrier_status2name(enum barrier_status status)
+{
+	const char *label = "unknown";
+
+	switch (status) {
+	case BS_INIT:
+		label = "init";
+		break;
+	case BS_FREEZING_P1:
+		label = "freezing_p1";
+		break;
+	case BS_FREEZING_P2:
+		label = "freezing_p2";
+		break;
+	case BS_FROZEN:
+		label = "frozen";
+		break;
+	case BS_THAWING:
+		label = "thawing";
+		break;
+	case BS_THAWED:
+		label = "thawed";
+		break;
+	case BS_FAILED:
+		label = "failed";
+		break;
+	case BS_EXPIRED:
+		label = "expired";
+		break;
+	case BS_RESCAN:
+		label = "rescan";
+		break;
+	default:
+		break;
+	}
+
+	return label;
+}
+
+/**
+ * Parse the timeout argument of a barrier command.
+ *
+ * Unlike atoi(), non-numeric input, trailing garbage, negative values and
+ * values that do not fit into an int are rejected instead of being
+ * silently turned into 0 or into a wrapped-around number.
+ *
+ * \param[in] cmd	the command name, used in the error message
+ * \param[in] str	the timeout argument
+ * \param[out] bc	barrier control structure receiving bc_timeout
+ *
+ * \retval		0 for success
+ * \retval		-EINVAL if \a str is not a valid timeout
+ */
+static int barrier_parse_timeout(const char *cmd, const char *str,
+				 struct barrier_ctl *bc)
+{
+	char *end;
+	long val;
+
+	errno = 0;
+	val = strtol(str, &end, 10);
+	if (errno != 0 || end == str || *end != '\0' ||
+	    val < 0 || val != (long)(int)val) {
+		fprintf(stderr, "%s: invalid timeout '%s'\n", cmd, str);
+		return -EINVAL;
+	}
+
+	bc->bc_timeout = val;
+
+	return 0;
+}
+
+/**
+ * Send a prepared barrier control request to the MGS.
+ *
+ * The caller fills in everything in \a bc except bc_name. This helper
+ * finds the MGS device, checks and stores the filesystem name, packs the
+ * request and issues OBD_IOC_BARRIER. When \a report is not NULL and the
+ * ioctl succeeded, the reply is unpacked into \a bc and \a report is
+ * called to print it.
+ *
+ * \param[in] cmd	the command name, used in error messages
+ * \param[in] fsname	the filesystem name
+ * \param[in,out] bc	the barrier control structure
+ * \param[in] action	what is being done, used in the ioctl error message
+ * \param[in] report	optional function printing the reply
+ *
+ * \retval		the ioctl return value on success
+ * \retval		negative value or pack error on failure
+ */
+static int barrier_ioctl(const char *cmd, const char *fsname,
+			 struct barrier_ctl *bc, const char *action,
+			 void (*report)(const char *fsname,
+					const struct barrier_ctl *bc))
+{
+	struct obd_ioctl_data data;
+	char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
+	int rc;
+
+	memset(&data, 0, sizeof(data));
+	rc = data.ioc_dev = get_mgs_device();
+	if (rc < 0)
+		return rc;
+
+	/* The MGS rejects names longer than 8 characters as well. */
+	if (strlen(fsname) > 8) {
+		fprintf(stderr, "%s: fsname name %s is too long. "
+			"It should not exceed 8.\n", cmd, fsname);
+		return -EINVAL;
+	}
+
+	/* The caller zeroed *bc and the length was checked above. */
+	strncpy(bc->bc_name, fsname, sizeof(bc->bc_name));
+	data.ioc_inlbuf1 = (char *)bc;
+	data.ioc_inllen1 = sizeof(*bc);
+	memset(buf, 0, sizeof(rawbuf));
+	rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+	if (rc) {
+		fprintf(stderr, "Fail to pack ioctl data: rc = %d.\n", rc);
+		return rc;
+	}
+
+	rc = l_ioctl(OBD_DEV_ID, OBD_IOC_BARRIER, buf);
+	if (rc < 0) {
+		fprintf(stderr, "Fail to %s for %s: %s\n",
+			action, fsname, strerror(errno));
+		return rc;
+	}
+
+	if (report != NULL) {
+		/* data.ioc_inlbuf1 still points at *bc, so unpacking
+		 * copies the reply back into it. */
+		obd_ioctl_unpack(&data, buf, sizeof(rawbuf));
+		report(fsname, bc);
+	}
+
+	return rc;
+}
+
+/* Print the reply of a BC_STAT request. */
+static void barrier_stat_report(const char *fsname,
+				const struct barrier_ctl *bc)
+{
+	printf("The barrier for %s is in '%s'\n",
+	       fsname, barrier_status2name(bc->bc_status));
+	if (bc->bc_status == BS_FREEZING_P1 ||
+	    bc->bc_status == BS_FREEZING_P2 ||
+	    bc->bc_status == BS_FROZEN)
+		printf("The barrier will be expired after %d "
+		       "seconds\n", bc->bc_timeout);
+}
+
+/* Print the reply of a BC_RESCAN request. */
+static void barrier_rescan_report(const char *fsname,
+				  const struct barrier_ctl *bc)
+{
+	printf("%u of %u MDT(s) in the filesystem %s are inactive\n",
+	       bc->bc_absence, bc->bc_total, fsname);
+}
+
+/**
+ * lctl barrier_freeze <fsname> [timeout]
+ *
+ * Freeze the write barrier of the filesystem. A missing or zero timeout
+ * selects BARRIER_TIMEOUT_DEFAULT.
+ */
+int jt_barrier_freeze(int argc, char **argv)
+{
+	struct barrier_ctl bc;
+	int rc;
+
+	if (argc < 2 || argc > 3)
+		return CMD_HELP;
+
+	memset(&bc, 0, sizeof(bc));
+	bc.bc_version = BARRIER_VERSION_V1;
+	bc.bc_cmd = BC_FREEZE;
+	if (argc == 3) {
+		rc = barrier_parse_timeout(argv[0], argv[2], &bc);
+		if (rc)
+			return rc;
+	}
+	if (bc.bc_timeout == 0)
+		bc.bc_timeout = BARRIER_TIMEOUT_DEFAULT;
+
+	return barrier_ioctl(argv[0], argv[1], &bc, "freeze barrier", NULL);
+}
+
+/**
+ * lctl barrier_thaw <fsname>
+ *
+ * Thaw the write barrier of the filesystem.
+ */
+int jt_barrier_thaw(int argc, char **argv)
+{
+	struct barrier_ctl bc;
+
+	if (argc != 2)
+		return CMD_HELP;
+
+	memset(&bc, 0, sizeof(bc));
+	bc.bc_version = BARRIER_VERSION_V1;
+	bc.bc_cmd = BC_THAW;
+
+	return barrier_ioctl(argv[0], argv[1], &bc, "thaw barrier", NULL);
+}
+
+/**
+ * lctl barrier_stat <fsname>
+ *
+ * Query and print the write barrier status of the filesystem.
+ */
+int jt_barrier_stat(int argc, char **argv)
+{
+	struct barrier_ctl bc;
+
+	if (argc != 2)
+		return CMD_HELP;
+
+	memset(&bc, 0, sizeof(bc));
+	bc.bc_version = BARRIER_VERSION_V1;
+	bc.bc_cmd = BC_STAT;
+
+	return barrier_ioctl(argv[0], argv[1], &bc, "query barrier",
+			     barrier_stat_report);
+}
+
+/**
+ * lctl barrier_rescan <fsname> [timeout]
+ *
+ * Rescan the barrier bitmap of the filesystem and print how many MDTs
+ * are inactive. A missing or zero timeout selects
+ * BARRIER_TIMEOUT_DEFAULT.
+ */
+int jt_barrier_rescan(int argc, char **argv)
+{
+	struct barrier_ctl bc;
+	int rc;
+
+	if (argc < 2 || argc > 3)
+		return CMD_HELP;
+
+	memset(&bc, 0, sizeof(bc));
+	bc.bc_version = BARRIER_VERSION_V1;
+	bc.bc_cmd = BC_RESCAN;
+	if (argc == 3) {
+		rc = barrier_parse_timeout(argv[0], argv[2], &bc);
+		if (rc)
+			return rc;
+	}
+	if (bc.bc_timeout == 0)
+		bc.bc_timeout = BARRIER_TIMEOUT_DEFAULT;
+
+	return barrier_ioctl(argv[0], argv[1], &bc, "rescan barrier bitmap",
+			     barrier_rescan_report);
+}
+
int jt_get_obj_version(int argc, char **argv)
{
struct lu_fid fid;