Whamcloud - gitweb
LU-8900 snapshot: user interface for write barrier on MDT 65/24265/19
authorFan Yong <fan.yong@intel.com>
Fri, 4 Nov 2016 10:29:14 +0000 (18:29 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Thu, 23 Mar 2017 01:38:42 +0000 (01:38 +0000)
The user can control the write barrier on MDTs via lctl commands:

Freeze barrier:
lctl barrier_freeze <fsname> [timeout (in seconds)]
NOTE: the default timeout value is 30 (seconds).

Thaw barrier:
lctl barrier_thaw <fsname>

Query barrier:
lctl barrier_stat <fsname>

Rescan barrier bitmap:
lctl barrier_rescan <fsname> [timeout (in seconds)]
NOTE: the default timeout value is 30 (seconds).

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: Id953203fc3ce6ebbce9f1ae0511fbe2b3813bb9f
Reviewed-on: https://review.whamcloud.com/24265
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Niu Yawei <yawei.niu@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
12 files changed:
libcfs/include/libcfs/util/list.h
lustre/include/lustre/lustre_barrier_user.h
lustre/include/lustre_ioctl.h
lustre/mgs/Makefile.in
lustre/mgs/mgs_barrier.c [new file with mode: 0644]
lustre/mgs/mgs_handler.c
lustre/mgs/mgs_internal.h
lustre/target/barrier.c
lustre/tests/sanity.sh
lustre/utils/lctl.c
lustre/utils/obd.c
lustre/utils/obdctl.h

index 2372f5b..ef69efe 100644 (file)
@@ -483,4 +483,17 @@ static inline void hlist_add_after(struct hlist_node *n,
             &pos->member != (head);                                         \
             pos = n, n = list_entry(n->member.next, typeof(*n), member))
 
+/**
+ * Iterate backwards over a list of given type safely against removal of entry
+ *
+ * The predecessor of \a pos is cached in \a n before the loop body runs, so
+ * the body may unlink (and free) \a pos without breaking the traversal.
+ * Both \a pos and \a n are assigned by the macro and are evaluated more
+ * than once, so they must be plain lvalues without side effects.
+ *
+ * \param pos        the type * to use as a loop counter.
+ * \param n          another type * to use as temporary storage
+ * \param head       the head for your list.
+ * \param member     the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe_reverse(pos, n, head, member)         \
+       for (pos = list_entry((head)->prev, typeof(*pos), member),      \
+               n = list_entry(pos->member.prev, typeof(*pos), member); \
+            &pos->member != (head);                                    \
+            pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+
 #endif /* __LIBCFS_UTIL_LIST_H__ */
index ba42281..e69bdc2 100644 (file)
 #ifndef _LUSTRE_BARRIER_USER_H
 # define _LUSTRE_BARRIER_USER_H
 
+#include <lustre/lustre_user.h>
+
+/* The only barrier_ctl::bc_version value accepted by the MGS handler. */
+#define BARRIER_VERSION_V1     1
+/* Timeout (in seconds) used by the freeze command when bc_timeout is 0. */
+#define BARRIER_TIMEOUT_DEFAULT        30
+
+/* Values for barrier_ctl::bc_cmd, selecting the barrier operation. */
+enum barrier_commands {
+       BC_FREEZE       = 1,    /* set up the write barrier */
+       BC_THAW         = 2,    /* remove the write barrier */
+       BC_STAT         = 3,    /* query the current barrier status */
+       BC_RESCAN       = 4,    /* rebuild the barrier MDT bitmap */
+};
+
 enum barrier_status {
        BS_INIT         = 0,
        BS_FREEZING_P1  = 1,
@@ -43,4 +55,19 @@ enum barrier_status {
        BS_RESCAN       = 8,
 };
 
+/*
+ * Control block exchanged between user space and the MGS through the
+ * OBD_IOC_BARRIER ioctl. The two anonymous unions overlay an input field
+ * with an output field, so which member is meaningful depends on bc_cmd.
+ */
+struct barrier_ctl {
+       /* must be BARRIER_VERSION_V1 */
+       __u32   bc_version;
+       /* one of enum barrier_commands */
+       __u32   bc_cmd;
+       union {
+               /* timeout in seconds: input for freeze/rescan, and the
+                * remaining time reported back by stat */
+               __s32   bc_timeout;
+               /* rescan output: number of MDTs counted */
+               __u32   bc_total;
+       };
+       union {
+               /* stat output: one of enum barrier_status */
+               __u32   bc_status;
+               /* rescan output: number of MDTs that did not answer */
+               __u32   bc_absence;
+       };
+       /* file system name; the MGS rejects names longer than 8 characters */
+       char    bc_name[12];
+       /* padding; not read by the MGS barrier handlers */
+       __u32   bc_padding;
+};
+
 #endif /* _LUSTRE_BARRIER_USER_H */
index ecc6590..21b13d4 100644 (file)
@@ -425,6 +425,8 @@ obd_ioctl_unpack(struct obd_ioctl_data *data, char *pbuf, int max_len)
 /*     lustre/lustre_user.h    240-249 */
 /*     LIBCFS_IOC_DEBUG_MASK   250 */
 
+#define OBD_IOC_BARRIER                _IOWR('f', 261, OBD_IOC_DATA_TYPE)
+
 #define IOC_OSC_SET_ACTIVE     _IOWR('h', 21, void *)
 
 #endif /* LUSTRE_IOCTL_H_ */
index d1cafa0..958092c 100644 (file)
@@ -1,5 +1,6 @@
 MODULES := mgs
 mgs-objs := mgs_handler.o mgs_fs.o mgs_llog.o lproc_mgs.o mgs_nids.o
+mgs-objs += mgs_barrier.o
 
 EXTRA_DIST := $(mgs-objs:%.o=%.c) mgs_internal.h
 
diff --git a/lustre/mgs/mgs_barrier.c b/lustre/mgs/mgs_barrier.c
new file mode 100644 (file)
index 0000000..ef18273
--- /dev/null
@@ -0,0 +1,739 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * lustre/mgs/mgs_barrier.c
+ *
+ * Author: Fan, Yong <fan.yong@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_MGS
+#define D_MGS D_CONFIG
+
+#include <lustre_ioctl.h>
+#include <lustre_swab.h>
+#include <lustre/lustre_barrier_user.h>
+
+#include "mgs_internal.h"
+
+/**
+ * Handle the barrier lock glimpse reply.
+ *
+ * The barrier lock glimpse reply contains the target MDT's index and
+ * the barrier operation status on such MDT. If the status given by the
+ * MDT is the expected one, then set the related bit in the 'fsdb''s
+ * barrier bitmap; otherwise record the unexpected status, unless the
+ * bit for that MDT has already been set.
+ *
+ * The MDT index is taken from the reply buffer, so it is range-checked
+ * once, before it is used as a bit offset for either bitmap access.
+ *
+ * Any non-zero return value is also stored in fsdb::fsdb_barrier_result.
+ *
+ * \param[in] env      pointer to the thread context
+ * \param[in] req      pointer to the glimpse callback RPC request
+ * \param[in] data     pointer to the async glimpse callback data
+ * \param[in] rc       the glimpse callback RPC return value
+ *
+ * \retval             0 for success
+ * \retval             negative error number on failure
+ */
+static int mgs_barrier_gl_interpret_reply(const struct lu_env *env,
+                                         struct ptlrpc_request *req,
+                                         void *data, int rc)
+{
+       struct ldlm_cb_async_args *ca = data;
+       struct fs_db *fsdb = ca->ca_set_arg->gl_interpret_data;
+       struct barrier_lvb *lvb;
+       ENTRY;
+
+       if (rc) {
+               if (rc == -ENODEV)
+                       /* The lock is useless, cancel it. */
+                       ldlm_lock_cancel(ca->ca_lock);
+
+               GOTO(out, rc);
+       }
+
+       lvb = req_capsule_server_swab_get(&req->rq_pill, &RMF_DLM_LVB,
+                                         lustre_swab_barrier_lvb);
+       if (!lvb)
+               GOTO(out, rc = -EPROTO);
+
+       /*
+        * Validate the index before BOTH bitmap accesses below; it was
+        * previously checked only on the set_bit() path, leaving the
+        * test_bit() path open to an out-of-range read.
+        *
+        * NOTE(review): the bitmaps are scanned as INDEX_MAP_SIZE * 8 bits
+        * elsewhere in this file, so this limit looks conservative; it is
+        * kept exactly as the original check had it.
+        */
+       if (unlikely(lvb->lvb_index > INDEX_MAP_SIZE))
+               GOTO(out, rc = -EINVAL);
+
+       if (lvb->lvb_status == fsdb->fsdb_barrier_expected)
+               set_bit(lvb->lvb_index, fsdb->fsdb_barrier_map);
+       else if (likely(!test_bit(lvb->lvb_index, fsdb->fsdb_barrier_map)))
+               /* Remember the unexpected status reported by this MDT. */
+               fsdb->fsdb_barrier_result = lvb->lvb_status;
+
+       GOTO(out, rc);
+
+out:
+       if (rc)
+               fsdb->fsdb_barrier_result = rc;
+
+       return rc;
+}
+
+/**
+ * Send glimpse callback to the barrier locks holders.
+ *
+ * The glimpse callback takes the current barrier status. The barrier locks
+ * holders (on the MDTs) will take related barrier actions according to the
+ * given barrier status, then return their local barrier status.
+ *
+ * One glimpse work item is filled in for each lock found on the resource's
+ * granted list. Replies are handled by mgs_barrier_gl_interpret_reply(),
+ * which records its outcome in fsdb::fsdb_barrier_result; when sending the
+ * glimpses itself succeeds, that recorded result is what gets returned.
+ *
+ * \param[in] env      pointer to the thread context
+ * \param[in] mgs      pointer to the MGS device
+ * \param[in] fsdb     pointer to the barrier 'fsdb'
+ * \param[in] timeout  indicate when the barrier will be expired
+ * \param[in] expected the expected barrier status on remote servers (MDTs)
+ *
+ * \retval             positive number for unexpected barrier status
+ * \retval             0 for success
+ * \retval             negative error number on failure
+ */
+static int mgs_barrier_glimpse_lock(const struct lu_env *env,
+                                   struct mgs_device *mgs,
+                                   struct fs_db *fsdb,
+                                   __u32 timeout, __u32 expected)
+{
+       union ldlm_gl_desc *desc = &mgs_env_info(env)->mgi_gl_desc;
+       struct ldlm_res_id res_id;
+       struct ldlm_resource *res;
+       struct ldlm_glimpse_work *work;
+       struct ldlm_glimpse_work *tmp;
+       struct list_head gl_list = LIST_HEAD_INIT(gl_list);
+       struct list_head *pos;
+       int i;
+       int rc;
+       ENTRY;
+
+       LASSERT(fsdb->fsdb_mdt_count > 0);
+
+       rc = mgc_logname2resid(fsdb->fsdb_name, &res_id, CONFIG_T_BARRIER);
+       if (rc)
+               RETURN(rc);
+
+       res = ldlm_resource_get(mgs->mgs_obd->obd_namespace, NULL, &res_id,
+                               LDLM_PLAIN, 0);
+       if (IS_ERR(res))
+               RETURN(PTR_ERR(res));
+
+       /* Reset the per-round result and publish the status that the reply
+        * handler compares each MDT's answer against. */
+       fsdb->fsdb_barrier_result = 0;
+       fsdb->fsdb_barrier_expected = expected;
+       desc->barrier_desc.lgbd_status = fsdb->fsdb_barrier_status;
+       desc->barrier_desc.lgbd_timeout = timeout;
+
+again:
+       /* On a retry, drop the lock references taken by the previous pass.
+        * The work items stay on the list and are reused; the first item
+        * without a lock marks the end of the filled-in part. */
+       list_for_each_entry(work, &gl_list, gl_list) {
+               if (!work->gl_lock)
+                       break;
+
+               LDLM_LOCK_RELEASE(work->gl_lock);
+               work->gl_lock = NULL;
+       }
+
+       /* It is not big issue to alloc more work item than needed. */
+       for (i = 0; i < fsdb->fsdb_mdt_count; i++) {
+               OBD_ALLOC_PTR(work);
+               if (!work)
+                       GOTO(out, rc = -ENOMEM);
+
+               list_add_tail(&work->gl_list, &gl_list);
+       }
+
+       /* Start filling from the first work item on the list. */
+       work = list_entry(gl_list.next, struct ldlm_glimpse_work, gl_list);
+
+       lock_res(res);
+       list_for_each(pos, &res->lr_granted) {
+               struct ldlm_lock *lock = list_entry(pos, struct ldlm_lock,
+                                                   l_res_link);
+
+               work->gl_lock = LDLM_LOCK_GET(lock);
+               work->gl_flags = 0;
+               work->gl_desc = desc;
+               work->gl_interpret_reply = mgs_barrier_gl_interpret_reply;
+               work->gl_interpret_data = fsdb;
+
+               /* This was the last pre-allocated work item. */
+               if (unlikely(work->gl_list.next == &gl_list)) {
+                       /* ... and also the last granted lock: done. */
+                       if (likely(pos->next == &res->lr_granted))
+                               break;
+
+                       unlock_res(res);
+                       /* The granted locks are more than the MDTs count. */
+                       goto again;
+               }
+
+               work = list_entry(work->gl_list.next, struct ldlm_glimpse_work,
+                                 gl_list);
+       }
+       unlock_res(res);
+
+       /* The MDTs count may be more than the granted locks. */
+       /* Trim the unused work items from the tail of the list. */
+       list_for_each_entry_safe_reverse(work, tmp, &gl_list, gl_list) {
+               if (work->gl_lock)
+                       break;
+
+               list_del(&work->gl_list);
+               OBD_FREE_PTR(work);
+       }
+
+       /* No granted lock at all means there is nobody to notify. */
+       if (!list_empty(&gl_list))
+               rc = ldlm_glimpse_locks(res, &gl_list);
+       else
+               rc = -ENODEV;
+
+       GOTO(out, rc);
+
+out:
+       /* Release whatever work items (and lock references) are left. */
+       list_for_each_entry_safe(work, tmp, &gl_list, gl_list) {
+               list_del(&work->gl_list);
+               if (work->gl_lock)
+                       LDLM_LOCK_RELEASE(work->gl_lock);
+               OBD_FREE_PTR(work);
+       }
+
+       ldlm_resource_putref(res);
+       /* The glimpses were sent fine: report what the reply handlers
+        * recorded (0, a positive barrier status, or a negative error). */
+       if (!rc)
+               rc = fsdb->fsdb_barrier_result;
+
+       return rc;
+}
+
+/**
+ * Seed the barrier 'fsdb' with the MDT layout of another 'fsdb'.
+ *
+ * Looks up the 'fsdb' called \a name and, if it exists, copies its MDT
+ * index map and MDT count into \a b_fsdb. Nothing is changed when the
+ * lookup fails.
+ *
+ * \param[in] mgs      pointer to the MGS device
+ * \param[in] b_fsdb   pointer to the barrier 'fsdb' to be filled in
+ * \param[in] name     name of the 'fsdb' to copy the MDT layout from
+ */
+static void mgs_barrier_bitmap_setup(struct mgs_device *mgs,
+                                    struct fs_db *b_fsdb,
+                                    const char *name)
+{
+       struct fs_db *src = mgs_find_fsdb(mgs, name);
+
+       if (unlikely(!src))
+               return;
+
+       memcpy(b_fsdb->fsdb_mdt_index_map, src->fsdb_mdt_index_map,
+              INDEX_MAP_SIZE);
+       b_fsdb->fsdb_mdt_count = src->fsdb_mdt_count;
+
+       /* Drop the reference taken by the lookup above. */
+       mgs_put_fsdb(mgs, src);
+}
+
+/**
+ * Check whether every known MDT has reported the expected barrier status.
+ *
+ * \param[in] fsdb     pointer to the barrier 'fsdb'
+ *
+ * \retval             true if each bit set in the MDT index map is also
+ *                     set in the barrier map
+ * \retval             false otherwise
+ */
+static bool mgs_barrier_done(struct fs_db *fsdb)
+{
+       int bit;
+
+       for (bit = 0; bit < INDEX_MAP_SIZE * 8; bit++) {
+               /* Only indexes present in the MDT map matter. */
+               if (!test_bit(bit, fsdb->fsdb_mdt_index_map))
+                       continue;
+
+               if (!test_bit(bit, fsdb->fsdb_barrier_map))
+                       return false;
+       }
+
+       return true;
+}
+
+/**
+ * Create the barrier for the given instance.
+ *
+ * We use two-phases barrier to guarantee that after the barrier setup:
+ * 1) All the server side pending async modification RPCs have been flushed.
+ * 2) Any subsequent modification will be blocked.
+ * 3) All async transactions on the MDTs have been committed.
+ *
+ * For phase1, we do the following:
+ *
+ * Firstly, it sets barrier flag on the instance that will block subsequent
+ * modifications from clients. (Note: server sponsored modification will be
+ * allowed for flush pending modifications)
+ *
+ * Secondly, it will flush all pending modification via dt_sync(), such as
+ * async OST-object destroy, async OST-object owner changes, and so on.
+ *
+ * If there are some on-handling clients sponsored modifications during the
+ * barrier creating, then related modifications may cause pending requests
+ * after the first dt_sync(), so call dt_sync() again after all on-handling
+ * modifications done.
+ *
+ * With the phase1 barrier set, all pending cross-servers modification RPCs
+ * have been flushed to remote servers, and any new modification will be
+ * blocked. But it does not guarantee that all the updates have been
+ * committed to storage on remote servers. So when all the instances have
+ * done phase1 barrier successfully, the MGS will notify all instances to
+ * do the phase2 barrier as following:
+ *
+ * Every barrier instance will call dt_sync() to make all async transactions
+ * to be committed locally.
+ *
+ * \param[in] env      pointer to the thread context
+ * \param[in] mgs      pointer to the MGS device
+ * \param[in] bc       pointer to the barrier control structure
+ *
+ * \retval             0 for success
+ * \retval             negative error number on failure
+ */
+static int mgs_barrier_freeze(const struct lu_env *env,
+                             struct mgs_device *mgs,
+                             struct barrier_ctl *bc)
+{
+       char *name = mgs_env_info(env)->mgi_fsname;
+       struct fs_db *fsdb;
+       int rc = 0;
+       int left;
+       bool phase1 = true;
+       bool dirty = false;
+       ENTRY;
+
+       /* The barrier state is kept in its own 'fsdb', which is named
+        * "<fsname>-<BARRIER_FILENAME>". */
+       snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
+                bc->bc_name, BARRIER_FILENAME);
+
+       down_write(&mgs->mgs_barrier_rwsem);
+       mutex_lock(&mgs->mgs_mutex);
+
+       fsdb = mgs_find_fsdb(mgs, name);
+       if (!fsdb) {
+               mutex_unlock(&mgs->mgs_mutex);
+               up_write(&mgs->mgs_barrier_rwsem);
+
+               RETURN(-ENODEV);
+       }
+
+       /* The barrier 'fsdb' has no MDT recorded yet: copy the MDT index
+        * map and count from the 'fsdb' of the file system itself. */
+       if (unlikely(fsdb->fsdb_mdt_count == 0))
+               mgs_barrier_bitmap_setup(mgs, fsdb, bc->bc_name);
+
+       mutex_lock(&fsdb->fsdb_mutex);
+       mutex_unlock(&mgs->mgs_mutex);
+
+       switch (fsdb->fsdb_barrier_status) {
+       case BS_THAWING:
+       case BS_RESCAN:
+               rc = -EBUSY;
+               break;
+       case BS_FREEZING_P1:
+       case BS_FREEZING_P2:
+               rc = -EINPROGRESS;
+               break;
+       case BS_FROZEN:
+               if (cfs_time_before(cfs_time_current_sec(),
+                                   fsdb->fsdb_barrier_latest_create_time +
+                                   fsdb->fsdb_barrier_timeout)) {
+                       rc = -EALREADY;
+                       break;
+               }
+               /* The frozen barrier has passed its deadline: fall through
+                * and allow a new freeze. */
+       case BS_INIT:
+       case BS_THAWED:
+       case BS_EXPIRED:
+       case BS_FAILED:
+               if (fsdb->fsdb_barrier_disabled) {
+                       rc = -EOPNOTSUPP;
+               } else if (unlikely(fsdb->fsdb_mdt_count == 0)) {
+                       rc = -ENODEV;
+               } else {
+                       fsdb->fsdb_barrier_latest_create_time =
+                                                       cfs_time_current_sec();
+                       fsdb->fsdb_barrier_status = BS_FREEZING_P1;
+                       /* NOTE(review): bc_timeout is __s32 while
+                        * fsdb_barrier_timeout is __u32; a negative value
+                        * is not rejected here - confirm that the callers
+                        * validate it. */
+                       if (bc->bc_timeout != 0)
+                               fsdb->fsdb_barrier_timeout = bc->bc_timeout;
+                       else
+                               fsdb->fsdb_barrier_timeout =
+                                               BARRIER_TIMEOUT_DEFAULT;
+                       memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
+               }
+               break;
+       default:
+               LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
+                             bc->bc_name, fsdb->fsdb_barrier_status);
+               rc = -EINVAL;
+               LBUG();
+       }
+
+       if (rc)
+               GOTO(out, rc);
+
+       /* Seconds left before the barrier expires; handed to the MDTs. */
+       left = fsdb->fsdb_barrier_timeout;
+
+again:
+       /* Both locks are dropped while the glimpse is in progress and are
+        * taken again right after it. */
+       mutex_unlock(&fsdb->fsdb_mutex);
+       up_write(&mgs->mgs_barrier_rwsem);
+
+       CFS_FAIL_TIMEOUT(OBD_FAIL_BARRIER_DELAY, cfs_fail_val);
+
+       rc = mgs_barrier_glimpse_lock(env, mgs, fsdb, left,
+                                     phase1 ? BS_FREEZING_P1 : BS_FROZEN);
+       down_write(&mgs->mgs_barrier_rwsem);
+       mutex_lock(&fsdb->fsdb_mutex);
+
+       /* From now on a failure is followed by a BS_THAWED glimpse (see the
+        * 'out' label), since a freeze glimpse has been attempted. */
+       dirty = true;
+       left = fsdb->fsdb_barrier_latest_create_time +
+               fsdb->fsdb_barrier_timeout - cfs_time_current_sec();
+       if (left <= 0) {
+               fsdb->fsdb_barrier_status = BS_EXPIRED;
+
+               GOTO(out, rc = -ETIME);
+       }
+
+       LASSERTF(fsdb->fsdb_barrier_status ==
+                (phase1 ? BS_FREEZING_P1 : BS_FREEZING_P2),
+                "unexpected barrier status %u\n",
+                fsdb->fsdb_barrier_status);
+
+       if (rc == -ETIMEDOUT) {
+               fsdb->fsdb_barrier_status = BS_EXPIRED;
+               rc = -ETIME;
+       } else if (rc > 0) {
+               /* A positive value is the unexpected barrier status that
+                * was reported by some MDT; adopt it as the new status. */
+               fsdb->fsdb_barrier_status = rc;
+               rc = -EREMOTE;
+       } else if (rc < 0) {
+               fsdb->fsdb_barrier_status = BS_FAILED;
+       } else if (mgs_barrier_done(fsdb)) {
+               if (phase1) {
+                       /* All MDTs finished phase1: clear the bitmap and
+                        * run the second round. */
+                       fsdb->fsdb_barrier_status = BS_FREEZING_P2;
+                       memset(fsdb->fsdb_barrier_map, 0,
+                              INDEX_MAP_SIZE);
+                       phase1 = false;
+
+                       goto again;
+               } else {
+                       fsdb->fsdb_barrier_status = BS_FROZEN;
+               }
+       } else {
+               /* No error, but some known MDT did not set its bit. */
+               fsdb->fsdb_barrier_status = BS_FAILED;
+               rc = -EREMOTE;
+       }
+
+       GOTO(out, rc);
+
+out:
+       mutex_unlock(&fsdb->fsdb_mutex);
+       up_write(&mgs->mgs_barrier_rwsem);
+       /* NOTE(review): the bitmap is cleared and the BS_THAWED glimpse is
+        * sent after both locks were released, and the glimpse result is
+        * ignored - presumably a best-effort rollback; confirm. */
+       if (rc && dirty) {
+               memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
+               mgs_barrier_glimpse_lock(env, mgs, fsdb, 0, BS_THAWED);
+       }
+
+       mgs_put_fsdb(mgs, fsdb);
+
+       return rc;
+}
+
+/**
+ * Remove the barrier for the given instance.
+ *
+ * Moves the barrier 'fsdb' named "<fsname>-<BARRIER_FILENAME>" to
+ * BS_THAWING, glimpses the barrier lock holders expecting BS_THAWED, then
+ * records the final status (BS_THAWED, BS_FAILED, or the unexpected status
+ * reported by an MDT) in the 'fsdb'.
+ *
+ * \param[in] env      pointer to the thread context
+ * \param[in] mgs      pointer to the MGS device
+ * \param[in] bc       pointer to the barrier control structure
+ *
+ * \retval             0 for success
+ * \retval             negative error number on failure
+ */
+static int mgs_barrier_thaw(const struct lu_env *env,
+                           struct mgs_device *mgs,
+                           struct barrier_ctl *bc)
+{
+       char *name = mgs_env_info(env)->mgi_fsname;
+       struct fs_db *fsdb;
+       int rc = 0;
+       ENTRY;
+
+       snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
+                bc->bc_name, BARRIER_FILENAME);
+
+       down_write(&mgs->mgs_barrier_rwsem);
+       mutex_lock(&mgs->mgs_mutex);
+
+       fsdb = mgs_find_fsdb(mgs, name);
+       if (!fsdb) {
+               mutex_unlock(&mgs->mgs_mutex);
+               up_write(&mgs->mgs_barrier_rwsem);
+
+               RETURN(-ENODEV);
+       }
+
+       /* The barrier 'fsdb' has no MDT recorded yet: copy the MDT index
+        * map and count from the 'fsdb' of the file system itself. */
+       if (unlikely(fsdb->fsdb_mdt_count == 0))
+               mgs_barrier_bitmap_setup(mgs, fsdb, bc->bc_name);
+
+       mutex_lock(&fsdb->fsdb_mutex);
+       mutex_unlock(&mgs->mgs_mutex);
+
+       switch (fsdb->fsdb_barrier_status) {
+       case BS_FREEZING_P1:
+       case BS_FREEZING_P2:
+       case BS_RESCAN:
+               rc = -EBUSY;
+               break;
+       case BS_INIT:
+       case BS_THAWED:
+               rc = -EALREADY;
+               break;
+       case BS_THAWING:
+               rc = -EINPROGRESS;
+               break;
+       case BS_FROZEN:
+       case BS_EXPIRED: /* The barrier on some MDT(s) may be expired,
+                         * but may be not on others. Destroy anyway. */
+       case BS_FAILED:
+               if (unlikely(fsdb->fsdb_mdt_count == 0)) {
+                       rc = -ENODEV;
+               } else {
+                       fsdb->fsdb_barrier_status = BS_THAWING;
+                       memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
+               }
+               break;
+       default:
+               LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
+                             bc->bc_name, fsdb->fsdb_barrier_status);
+               rc = -EINVAL;
+               LBUG();
+       }
+
+       if (rc)
+               GOTO(out, rc);
+
+       /* Both locks are dropped while the glimpse is in progress and are
+        * taken again right after it. */
+       mutex_unlock(&fsdb->fsdb_mutex);
+       up_write(&mgs->mgs_barrier_rwsem);
+
+       CFS_FAIL_TIMEOUT(OBD_FAIL_BARRIER_DELAY, cfs_fail_val);
+
+       rc = mgs_barrier_glimpse_lock(env, mgs, fsdb, 0, BS_THAWED);
+       down_write(&mgs->mgs_barrier_rwsem);
+       mutex_lock(&fsdb->fsdb_mutex);
+
+       LASSERTF(fsdb->fsdb_barrier_status == BS_THAWING,
+                "unexpected barrier status %u\n",
+                fsdb->fsdb_barrier_status);
+
+       if (rc > 0) {
+               /* A positive value is the unexpected barrier status that
+                * was reported by some MDT; adopt it as the new status. */
+               fsdb->fsdb_barrier_status = rc;
+               rc = -EREMOTE;
+       } else if (rc < 0) {
+               fsdb->fsdb_barrier_status = BS_FAILED;
+       } else if (mgs_barrier_done(fsdb)) {
+               fsdb->fsdb_barrier_status = BS_THAWED;
+       } else {
+               /* No error, but some known MDT did not set its bit. */
+               fsdb->fsdb_barrier_status = BS_FAILED;
+               rc = -EREMOTE;
+       }
+
+       GOTO(out, rc);
+
+out:
+       mutex_unlock(&fsdb->fsdb_mutex);
+       up_write(&mgs->mgs_barrier_rwsem);
+       mgs_put_fsdb(mgs, fsdb);
+
+       return rc;
+}
+
+/**
+ * Query the barrier status for the given instance.
+ *
+ * The status of the barrier 'fsdb' named "<fsname>-<BARRIER_FILENAME>" is
+ * copied into bc::bc_status; BS_INIT is reported if there is no such
+ * 'fsdb'. For a freezing or frozen barrier, either the remaining time is
+ * stored in bc::bc_timeout, or, if the deadline has passed, both the
+ * 'fsdb' and the reported status are switched to BS_EXPIRED.
+ *
+ * \param[in] env      pointer to the thread context
+ * \param[in] mgs      pointer to the MGS device
+ * \param[in,out] bc   pointer to the barrier control structure
+ *
+ * \retval             0 always
+ */
+static int mgs_barrier_stat(const struct lu_env *env,
+                           struct mgs_device *mgs,
+                           struct barrier_ctl *bc)
+{
+       char *name = mgs_env_info(env)->mgi_fsname;
+       struct fs_db *fsdb;
+       ENTRY;
+
+       snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
+                bc->bc_name, BARRIER_FILENAME);
+
+       mutex_lock(&mgs->mgs_mutex);
+       fsdb = mgs_find_fsdb(mgs, name);
+       if (!fsdb) {
+               /* No barrier 'fsdb': report the initial status. */
+               mutex_unlock(&mgs->mgs_mutex);
+               bc->bc_status = BS_INIT;
+
+               RETURN(0);
+       }
+
+       /* Take the 'fsdb' mutex before letting go of the MGS one. */
+       mutex_lock(&fsdb->fsdb_mutex);
+       mutex_unlock(&mgs->mgs_mutex);
+
+       bc->bc_status = fsdb->fsdb_barrier_status;
+       switch (bc->bc_status) {
+       case BS_FREEZING_P1:
+       case BS_FREEZING_P2:
+       case BS_FROZEN:
+               if (cfs_time_before(cfs_time_current_sec(),
+                                   fsdb->fsdb_barrier_latest_create_time +
+                                   fsdb->fsdb_barrier_timeout)) {
+                       /* Still valid: report the seconds left. */
+                       bc->bc_timeout =
+                               fsdb->fsdb_barrier_latest_create_time +
+                               fsdb->fsdb_barrier_timeout -
+                               cfs_time_current_sec();
+               } else {
+                       /* Deadline passed: expire the barrier now. */
+                       fsdb->fsdb_barrier_status = BS_EXPIRED;
+                       bc->bc_status = BS_EXPIRED;
+               }
+               break;
+       default:
+               break;
+       }
+
+       mutex_unlock(&fsdb->fsdb_mutex);
+       mgs_put_fsdb(mgs, fsdb);
+
+       RETURN(0);
+}
+
+/**
+ * Rescan the barrier bitmap for the given instance.
+ *
+ * Reloads the MDT index map of the barrier 'fsdb' from the 'fsdb' of the
+ * file system, glimpses the barrier lock holders expecting BS_INIT, and
+ * then rebuilds the MDT index map from the MDTs that answered. The number
+ * of MDTs counted and the number of those that did not answer are returned
+ * in bc::bc_total and bc::bc_absence.
+ *
+ * \param[in] env      pointer to the thread context
+ * \param[in] mgs      pointer to the MGS device
+ * \param[in,out] bc   pointer to the barrier control structure
+ *
+ * \retval             0 for success
+ * \retval             negative error number on failure
+ */
+static int mgs_barrier_rescan(const struct lu_env *env,
+                             struct mgs_device *mgs,
+                             struct barrier_ctl *bc)
+{
+       char *name = mgs_env_info(env)->mgi_fsname;
+       /* b_fsdb: the barrier 'fsdb'; c_fsdb: the file system's 'fsdb' */
+       struct fs_db *b_fsdb;
+       struct fs_db *c_fsdb;
+       int rc = 0;
+       ENTRY;
+
+       down_write(&mgs->mgs_barrier_rwsem);
+       mutex_lock(&mgs->mgs_mutex);
+
+       c_fsdb = mgs_find_fsdb(mgs, bc->bc_name);
+       /* NOTE(review): when c_fsdb is found but has no MDT, this path
+        * returns without a matching mgs_put_fsdb(mgs, c_fsdb) - confirm
+        * whether the lookup takes a reference that is leaked here. */
+       if (!c_fsdb || unlikely(c_fsdb->fsdb_mdt_count == 0)) {
+               mutex_unlock(&mgs->mgs_mutex);
+               up_write(&mgs->mgs_barrier_rwsem);
+
+               RETURN(-ENODEV);
+       }
+
+       snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
+                bc->bc_name, BARRIER_FILENAME);
+       b_fsdb = mgs_find_fsdb(mgs, name);
+       if (!b_fsdb) {
+               mutex_unlock(&mgs->mgs_mutex);
+               up_write(&mgs->mgs_barrier_rwsem);
+               mgs_put_fsdb(mgs, c_fsdb);
+
+               RETURN(-ENODEV);
+       }
+
+       mutex_lock(&b_fsdb->fsdb_mutex);
+       mutex_lock(&c_fsdb->fsdb_mutex);
+       mutex_unlock(&mgs->mgs_mutex);
+
+       switch (b_fsdb->fsdb_barrier_status) {
+       case BS_RESCAN:
+               rc = -EINPROGRESS;
+               break;
+       case BS_THAWING:
+       case BS_FREEZING_P1:
+       case BS_FREEZING_P2:
+               rc = -EBUSY;
+               break;
+       case BS_FROZEN:
+               if (cfs_time_before(cfs_time_current_sec(),
+                                   b_fsdb->fsdb_barrier_latest_create_time +
+                                   b_fsdb->fsdb_barrier_timeout)) {
+                       rc = -EBUSY;
+                       break;
+               }
+               /* The frozen barrier has passed its deadline: fall through
+                * and allow the rescan. */
+       case BS_INIT:
+       case BS_THAWED:
+       case BS_EXPIRED:
+       case BS_FAILED:
+               /* Restart from the MDT layout of the file system 'fsdb'. */
+               b_fsdb->fsdb_barrier_latest_create_time =
+                                                       cfs_time_current_sec();
+               b_fsdb->fsdb_barrier_status = BS_RESCAN;
+               memcpy(b_fsdb->fsdb_mdt_index_map, c_fsdb->fsdb_mdt_index_map,
+                      INDEX_MAP_SIZE);
+               memset(b_fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
+               b_fsdb->fsdb_mdt_count = c_fsdb->fsdb_mdt_count;
+               break;
+       default:
+               LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
+                             bc->bc_name, b_fsdb->fsdb_barrier_status);
+               rc = -EINVAL;
+               LBUG();
+       }
+
+       /* The file system 'fsdb' is not needed any longer. */
+       mutex_unlock(&c_fsdb->fsdb_mutex);
+       mgs_put_fsdb(mgs, c_fsdb);
+
+       if (rc)
+               GOTO(out, rc);
+
+again:
+       /* Both locks are dropped while the glimpse is in progress and are
+        * taken again right after it. */
+       mutex_unlock(&b_fsdb->fsdb_mutex);
+       up_write(&mgs->mgs_barrier_rwsem);
+       rc = mgs_barrier_glimpse_lock(env, mgs, b_fsdb, 0, BS_INIT);
+       down_write(&mgs->mgs_barrier_rwsem);
+       mutex_lock(&b_fsdb->fsdb_mutex);
+
+       LASSERTF(b_fsdb->fsdb_barrier_status == BS_RESCAN,
+                "unexpected barrier status %u\n",
+                b_fsdb->fsdb_barrier_status);
+
+       /* NOTE(review): bc_timeout shares storage with bc_total (anonymous
+        * union); it is only read here, before bc_total is written below.
+        * Unlike the freeze path, BARRIER_TIMEOUT_DEFAULT is not applied
+        * when bc_timeout is 0 - presumably the caller fills it in; verify. */
+       if (rc > 0) {
+               /* A positive value is the unexpected barrier status that
+                * was reported by some MDT; adopt it as the new status. */
+               b_fsdb->fsdb_barrier_status = rc;
+               rc = -EREMOTE;
+       } else if (rc == -ETIMEDOUT &&
+                  cfs_time_before(cfs_time_current_sec(),
+                                  b_fsdb->fsdb_barrier_latest_create_time +
+                                  bc->bc_timeout)) {
+               /* Timed out, but still inside the rescan window: retry. */
+               memset(b_fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
+
+               goto again;
+       } else if (rc < 0 && rc != -ETIMEDOUT && rc != -ENODEV) {
+               b_fsdb->fsdb_barrier_status = BS_FAILED;
+       } else {
+               /* Reached with rc == 0, -ENODEV, or -ETIMEDOUT once the
+                * rescan window is over; all are treated as success. */
+               int i;
+
+               b_fsdb->fsdb_mdt_count = 0;
+               bc->bc_total = 0;
+               bc->bc_absence = 0;
+               rc = 0;
+               for (i = 0; i < INDEX_MAP_SIZE * 8; i++) {
+                       if (test_bit(i, b_fsdb->fsdb_barrier_map)) {
+                               /* This MDT answered the glimpse. */
+                               b_fsdb->fsdb_mdt_count++;
+                       } else if (test_bit(i, b_fsdb->fsdb_mdt_index_map)) {
+                               /* Known MDT that did not answer. */
+                               b_fsdb->fsdb_mdt_count++;
+                               bc->bc_absence++;
+                       }
+               }
+
+               bc->bc_total = b_fsdb->fsdb_mdt_count;
+               /* Only the MDTs that answered stay in the index map. */
+               memcpy(b_fsdb->fsdb_mdt_index_map,
+                      b_fsdb->fsdb_barrier_map, INDEX_MAP_SIZE);
+               b_fsdb->fsdb_barrier_status = BS_INIT;
+       }
+
+       GOTO(out, rc);
+
+out:
+       mutex_unlock(&b_fsdb->fsdb_mutex);
+       up_write(&mgs->mgs_barrier_rwsem);
+       mgs_put_fsdb(mgs, b_fsdb);
+
+       return rc;
+}
+
+/**
+ * Entry point of the OBD_IOC_BARRIER ioctl on the MGS.
+ *
+ * Validates the barrier control structure carried in the first inline
+ * buffer of \a data, then dispatches to the freeze/thaw/stat/rescan
+ * handler selected by barrier_ctl::bc_cmd.
+ *
+ * \param[in] env      pointer to the thread context
+ * \param[in] mgs      pointer to the MGS device
+ * \param[in,out] data pointer to the ioctl data; its first inline buffer
+ *                     holds a struct barrier_ctl
+ *
+ * \retval             0 for success
+ * \retval             -EINVAL for a missing or short control buffer, a
+ *                     too long file system name, or an unknown command
+ * \retval             -EOPNOTSUPP for an unsupported control version
+ * \retval             other negative error number from the handler
+ */
+int mgs_iocontrol_barrier(const struct lu_env *env,
+                         struct mgs_device *mgs,
+                         struct obd_ioctl_data *data)
+{
+       struct barrier_ctl *bc = (struct barrier_ctl *)(data->ioc_inlbuf1);
+       int rc;
+       ENTRY;
+
+       /* The inline buffer is supplied by user space: refuse a request
+        * that carries no buffer, or one too small to hold *bc, before
+        * any of its fields is read. */
+       if (unlikely(!bc || data->ioc_inllen1 < sizeof(*bc)))
+               RETURN(-EINVAL);
+
+       if (unlikely(bc->bc_version != BARRIER_VERSION_V1))
+               RETURN(-EOPNOTSUPP);
+
+       /* strnlen() is bounded by the array size, so an unterminated name
+        * is rejected here as well. */
+       if (unlikely(strnlen(bc->bc_name, sizeof(bc->bc_name)) > 8))
+               RETURN(-EINVAL);
+
+       switch (bc->bc_cmd) {
+       case BC_FREEZE:
+               rc = mgs_barrier_freeze(env, mgs, bc);
+               break;
+       case BC_THAW:
+               rc = mgs_barrier_thaw(env, mgs, bc);
+               break;
+       case BC_STAT:
+               rc = mgs_barrier_stat(env, mgs, bc);
+               break;
+       case BC_RESCAN:
+               rc = mgs_barrier_rescan(env, mgs, bc);
+               break;
+       default:
+               rc = -EINVAL;
+               break;
+       }
+
+       RETURN(rc);
+}
index 7265695..c33e780 100644 (file)
@@ -1067,6 +1067,10 @@ out_free:
                rc = mgs_iocontrol_pool(&env, mgs, data);
                break;
 
+       case OBD_IOC_BARRIER:
+               rc = mgs_iocontrol_barrier(&env, mgs, data);
+               break;
+
        case OBD_IOC_NODEMAP:
                rc = mgs_iocontrol_nodemap(&env, mgs, data);
                break;
index 46979e9..b9b1ea4 100644 (file)
@@ -132,6 +132,8 @@ struct fs_db {
        unsigned long     fsdb_flags;
        __u32             fsdb_barrier_status;
        __u32             fsdb_barrier_timeout;
+       __u32             fsdb_barrier_expected;
+       int               fsdb_barrier_result;
        time_t            fsdb_barrier_latest_create_time;
 
         /* in-memory copy of the srpc rules, guarded by fsdb_lock */
@@ -258,6 +260,11 @@ int mgs_client_free(struct obd_export *exp);
 int mgs_fs_setup(const struct lu_env *env, struct mgs_device *m);
 int mgs_fs_cleanup(const struct lu_env *env, struct mgs_device *m);
 
+/* mgs_barrier.c */
+int mgs_iocontrol_barrier(const struct lu_env *env,
+                         struct mgs_device *mgs,
+                         struct obd_ioctl_data *data);
+
 #ifdef CONFIG_PROC_FS
 int lproc_mgs_setup(struct mgs_device *mgs, const char *osd_name);
 void lproc_mgs_cleanup(struct mgs_device *mgs);
@@ -298,6 +305,7 @@ struct mgs_thread_info {
        char                    mgi_fsname[MTI_NAME_MAXLEN];
        struct cfg_marker       mgi_marker;
        struct temp_comp        mgi_comp;
+       union ldlm_gl_desc      mgi_gl_desc;
 };
 
 extern struct lu_context_key mgs_thread_key;
index e52d303..6145e0e 100644 (file)
@@ -323,6 +323,9 @@ int barrier_handler(struct dt_device *key, struct ptlrpc_request *req)
                break;
        case BS_FREEZING_P1:
        case BS_FREEZING_P2:
+               if (OBD_FAIL_CHECK(OBD_FAIL_BARRIER_FAILURE))
+                       GOTO(fini, rc = -EINVAL);
+
                barrier->bi_deadline = cfs_time_current_sec() +
                                        desc->lgbd_timeout;
                rc = barrier_freeze(&env, barrier,
index 5ec81b2..839469c 100755 (executable)
@@ -15931,6 +15931,203 @@ test_409()
 }
 run_test 409 "Large amount of cross-MDTs hard links on the same file"
 
+prep_801() {
+       start_full_debug_logging
+       # cleanup unused barrier locks before test
+       do_facet mgs $LCTL barrier_rescan $FSNAME ||
+               error "Fail to prep barrier test env"
+}
+
+post_801() {
+       stop_full_debug_logging
+}
+# Walk the barrier through freeze, expiry, thaw and failure via lctl on the MGS
+test_801a() {
+       prep_801
+       # (1)-(2): delay the freeze so 'freezing_p1' can be seen, then 'frozen'
+       #define OBD_FAIL_BARRIER_DELAY          0x2202
+       do_facet mgs $LCTL set_param fail_val=3 fail_loc=0x2202
+       do_facet mgs $LCTL barrier_freeze $FSNAME 10 &
+       # $7 of "The barrier for <fs> is in '<status>'" is the quoted status
+       sleep 1
+       local b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                              awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'freezing_p1'" ] ||
+               error "(1) unexpected barrier status $b_status"
+       # clear the injected delay and let the background freeze finish
+       do_facet mgs $LCTL set_param fail_val=0 fail_loc=0
+       wait
+       b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                        awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'frozen'" ] ||
+               error "(2) unexpected barrier status $b_status"
+       # (3): sleep past the remaining seconds that barrier_stat reports
+       local expired=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                       awk '/will be expired/ { print $7 }')
+       echo "sleep $((expired + 3)) seconds, then the barrier will be expired"
+       sleep $((expired + 3))
+
+       b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                        awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'expired'" ] ||
+               error "(3) unexpected barrier status $b_status"
+       # (4)-(5): freezing again right after the expiry is expected to work
+       do_facet mgs $LCTL barrier_freeze $FSNAME 10 ||
+               error "(4) fail to freeze barrier"
+
+       b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                        awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'frozen'" ] ||
+               error "(5) unexpected barrier status $b_status"
+       # (6)-(7): delay the thaw so 'thawing' can be seen, then 'thawed'
+       #define OBD_FAIL_BARRIER_DELAY          0x2202
+       do_facet mgs $LCTL set_param fail_val=3 fail_loc=0x2202
+       do_facet mgs $LCTL barrier_thaw $FSNAME &
+
+       sleep 1
+       b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                        awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'thawing'" ] ||
+               error "(6) unexpected barrier status $b_status"
+
+       do_facet mgs $LCTL set_param fail_val=0 fail_loc=0
+       wait
+       b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                        awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'thawed'" ] ||
+               error "(7) unexpected barrier status $b_status"
+       # (8): with the failure injected on $SINGLEMDS, expect status 'failed'
+       #define OBD_FAIL_BARRIER_FAILURE        0x2203
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0x2203
+       do_facet mgs $LCTL barrier_freeze $FSNAME
+
+       b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                              awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'failed'" ] ||
+               error "(8) unexpected barrier status $b_status"
+       # remove the injected failure, then thaw
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0
+       do_facet mgs $LCTL barrier_thaw $FSNAME
+
+       post_801
+}
+run_test 801a "write barrier user interfaces and stat machine"
+
+test_801b() {
+       prep_801
+
+       mkdir $DIR/$tdir || error "(1) fail to mkdir"
+       createmany -d $DIR/$tdir/d 6 || "(2) fail to mkdir"
+       touch $DIR/$tdir/d2/f10 || error "(3) fail to touch"
+       touch $DIR/$tdir/d3/f11 || error "(4) fail to touch"
+       touch $DIR/$tdir/d4/f12 || error "(5) fail to touch"
+
+       cancel_lru_locks mdc
+
+       # 180 seconds should be long enough
+       do_facet mgs $LCTL barrier_freeze $FSNAME 180
+
+       local b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                              awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'frozen'" ] ||
+               error "(6) unexpected barrier status $b_status"
+
+       mkdir $DIR/$tdir/d0/d10 &
+       mkdir_pid=$!
+
+       touch $DIR/$tdir/d1/f13 &
+       touch_pid=$!
+
+       ln $DIR/$tdir/d2/f10 $DIR/$tdir/d2/f14 &
+       ln_pid=$!
+
+       mv $DIR/$tdir/d3/f11 $DIR/$tdir/d3/f15 &
+       mv_pid=$!
+
+       rm -f $DIR/$tdir/d4/f12 &
+       rm_pid=$!
+
+       stat $DIR/$tdir/d5 || error "(7) stat should succeed"
+
+       # To guarantee taht the 'stat' is not blocked
+       b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                        awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'frozen'" ] ||
+               error "(8) unexpected barrier status $b_status"
+
+       # let above commands to run at background
+       sleep 5
+
+       ps -p $mkdir_pid || error "(9) mkdir should be blocked"
+       ps -p $touch_pid || error "(10) touch should be blocked"
+       ps -p $ln_pid || error "(11) link should be blocked"
+       ps -p $mv_pid || error "(12) rename should be blocked"
+       ps -p $rm_pid || error "(13) unlink should be blocked"
+
+       b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                        awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'frozen'" ] ||
+               error "(14) unexpected barrier status $b_status"
+
+       do_facet mgs $LCTL barrier_thaw $FSNAME
+       b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                        awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'thawed'" ] ||
+               error "(15) unexpected barrier status $b_status"
+
+       wait $mkdir_pid || error "(16) mkdir should succeed"
+       wait $touch_pid || error "(17) touch should succeed"
+       wait $ln_pid || error "(18) link should succeed"
+       wait $mv_pid || error "(19) rename should succeed"
+       wait $rm_pid || error "(20) unlink should succeed"
+
+       post_801
+}
+run_test 801b "modification will be blocked by write barrier"
+# Check that barrier_rescan lets freeze/thaw work while one MDT is stopped
+test_801c() {
+       [[ $MDSCOUNT -lt 2 ]] && skip "needs >= 2 MDTs" && return
+
+       prep_801
+       # with mds2 stopped, the freeze must end up 'expired' or 'failed'
+       stop mds2 || error "(1) Fail to stop mds2"
+
+       do_facet mgs $LCTL barrier_freeze $FSNAME 30
+
+       local b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                              awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'expired'" -o "$b_status" = "'failed'" ] || {
+               do_facet mgs $LCTL barrier_thaw $FSNAME
+               error "(2) unexpected barrier status $b_status"
+       }
+       # after a rescan, freeze and thaw must work although mds2 is still down
+       do_facet mgs $LCTL barrier_rescan $FSNAME ||
+               error "(3) Fail to rescan barrier bitmap"
+
+       do_facet mgs $LCTL barrier_freeze $FSNAME 10
+
+       b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                        awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'frozen'" ] ||
+               error "(4) unexpected barrier status $b_status"
+
+       do_facet mgs $LCTL barrier_thaw $FSNAME
+       b_status=$(do_facet mgs $LCTL barrier_stat $FSNAME |
+                        awk '/The barrier for/ { print $7 }')
+       [ "$b_status" = "'thawed'" ] ||
+               error "(5) unexpected barrier status $b_status"
+       # restart mds2, then rescan again (presumably to pick the MDT up again)
+       local devname=$(mdsdevname 2)
+
+       start mds2 $devname $MDS_MOUNT_OPTS || error "(6) Fail to start mds2"
+
+       do_facet mgs $LCTL barrier_rescan $FSNAME ||
+               error "(7) Fail to rescan barrier bitmap"
+
+       post_801
+}
+run_test 801c "rescan barrier bitmap"
+
 #
 # tests that do cleanup/setup should be run at the end
 #
index 7af645d..5975162 100644 (file)
@@ -245,6 +245,21 @@ command_t cmdlist[] = {
         "list pools and pools members\n"
         "usage: pool_list  <fsname>[.<poolname>] | <pathname>"},
 
+       /* Barrier commands */
+       {"===  Barrier ==", NULL, 0, "barrier management"},
+       {"barrier_freeze", jt_barrier_freeze, 0,
+        "freeze write barrier on MDTs\n"
+        "usage: barrier_freeze <fsname> [timeout (in seconds)]"},
+       {"barrier_thaw", jt_barrier_thaw, 0,
+        "thaw write barrier on MDTs\n"
+        "usage: barrier_thaw <fsname>"},
+       {"barrier_stat", jt_barrier_stat, 0,
+        "query write barrier status on MDTs\n"
+        "usage: barrier_stat <fsname>"},
+       {"barrier_rescan", jt_barrier_rescan, 0,
+        "rescan the system to filter out inactive MDT(s) for barrier\n"
+        "usage: barrier_rescan <fsname> [timeout (in seconds)]"},
+
        /* Nodemap commands */
        {"=== Nodemap ===", NULL, 0, "nodemap management"},
        {"nodemap_activate", jt_nodemap_activate, 0,
index aa99316..c4a68a0 100644 (file)
@@ -75,6 +75,7 @@
 #include <lnet/lnetctl.h>
 #include <lustre/lustreapi.h>
 #include <lustre_param.h>
+#include <lustre/lustre_barrier_user.h>
 
 #define MAX_STRING_SIZE 128
 
@@ -4064,6 +4065,227 @@ out:
         return rc;
 }
 
+/**
+ * Translate a barrier status value into its printable name.
+ *
+ * \param[in] status   the barrier status to translate
+ *
+ * \retval             the name listed for \a status in the table below, or
+ *                     "unknown" if \a status is not listed
+ */
+static const char *barrier_status2name(enum barrier_status status)
+{
+       static const struct {
+               enum barrier_status      bsn_status;
+               const char              *bsn_name;
+       } names[] = {
+               { BS_INIT,              "init" },
+               { BS_FREEZING_P1,       "freezing_p1" },
+               { BS_FREEZING_P2,       "freezing_p2" },
+               { BS_FROZEN,            "frozen" },
+               { BS_THAWING,           "thawing" },
+               { BS_THAWED,            "thawed" },
+               { BS_FAILED,            "failed" },
+               { BS_EXPIRED,           "expired" },
+               { BS_RESCAN,            "rescan" },
+       };
+       unsigned int i;
+
+       for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+               if (names[i].bsn_status == status)
+                       return names[i].bsn_name;
+       }
+
+       return "unknown";
+}
+
+/**
+ * Common preparation shared by the barrier_* commands.
+ *
+ * Look up the MGS device, fill in \a bc for the command \a cmd on the
+ * filesystem named by argv[1], attach \a bc to \a data as inline buffer 1
+ * and pack \a data into the ioctl buffer. Error messages are printed here.
+ *
+ * \param[in] argc             number of command line arguments
+ * \param[in] argv             argv[0]: command name, argv[1]: fsname,
+ *                             argv[2]: optional timeout in seconds
+ * \param[in] cmd              the BC_xxx command to store in \a bc
+ * \param[in] use_timeout      non-zero if the command takes the optional
+ *                             timeout; a missing or zero timeout is then
+ *                             replaced with BARRIER_TIMEOUT_DEFAULT
+ * \param[out] data            the ioctl descriptor to initialise
+ * \param[out] bc              the barrier control block to initialise; it
+ *                             must stay valid for as long as \a data is used
+ * \param[in,out] pbuf         the ioctl buffer handed to obd_ioctl_pack()
+ * \param[in] buflen           size of the ioctl buffer in bytes
+ *
+ * \retval                     0 if the buffer is ready for l_ioctl()
+ * \retval                     non-zero on failure: the negative value from
+ *                             get_mgs_device(), -EINVAL for a bad timeout or
+ *                             an fsname longer than 8 characters, or the
+ *                             value returned by obd_ioctl_pack()
+ */
+static int barrier_ctl_prepare(int argc, char **argv, int cmd,
+                              int use_timeout,
+                              struct obd_ioctl_data *data,
+                              struct barrier_ctl *bc,
+                              char **pbuf, int buflen)
+{
+       int rc;
+
+       memset(data, 0, sizeof(*data));
+       rc = data->ioc_dev = get_mgs_device();
+       if (rc < 0)
+               return rc;
+
+       memset(bc, 0, sizeof(*bc));
+       bc->bc_version = BARRIER_VERSION_V1;
+       bc->bc_cmd = cmd;
+
+       if (use_timeout) {
+               if (argc == 3) {
+                       char *end;
+                       long val;
+
+                       /* atoi() would silently accept garbage and negative
+                        * values; reject both, and anything that does not
+                        * fit into bc_timeout. */
+                       errno = 0;
+                       val = strtol(argv[2], &end, 10);
+                       if (errno == 0 && end != argv[2] && *end == '\0' &&
+                           val >= 0)
+                               bc->bc_timeout = val;
+                       else
+                               val = -1;
+
+                       if (val < 0 || (long)bc->bc_timeout != val) {
+                               fprintf(stderr, "%s: invalid timeout '%s'.\n",
+                                       argv[0], argv[2]);
+                               return -EINVAL;
+                       }
+               }
+
+               /* zero means "not specified" */
+               if (bc->bc_timeout == 0)
+                       bc->bc_timeout = BARRIER_TIMEOUT_DEFAULT;
+       }
+
+       if (strlen(argv[1]) > 8) {
+               fprintf(stderr, "%s: fsname name %s is too long. "
+                       "It should not exceed 8.\n", argv[0], argv[1]);
+               return -EINVAL;
+       }
+
+       /* bc was zero-filled above.
+        * NOTE(review): NUL termination relies on sizeof(bc->bc_name) being
+        * larger than 8 - confirm against struct barrier_ctl. */
+       strncpy(bc->bc_name, argv[1], sizeof(bc->bc_name));
+       data->ioc_inlbuf1 = (char *)bc;
+       data->ioc_inllen1 = sizeof(*bc);
+       memset(*pbuf, 0, buflen);
+       rc = obd_ioctl_pack(data, pbuf, buflen);
+       if (rc)
+               fprintf(stderr, "Fail to pack ioctl data: rc = %d.\n", rc);
+
+       return rc;
+}
+
+/**
+ * lctl barrier_freeze <fsname> [timeout (in seconds)]
+ *
+ * Send a BC_FREEZE barrier request for \a fsname to the MGS device.
+ *
+ * \param[in] argc     number of arguments, 2 or 3
+ * \param[in] argv     argv[1]: fsname, argv[2]: optional timeout
+ *
+ * \retval             CMD_HELP on wrong usage
+ * \retval             the non-zero value from the preparation step on failure
+ * \retval             otherwise the return value of l_ioctl()
+ */
+int jt_barrier_freeze(int argc, char **argv)
+{
+       struct obd_ioctl_data data;
+       char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
+       struct barrier_ctl bc;
+       int rc;
+
+       if (argc < 2 || argc > 3)
+               return CMD_HELP;
+
+       rc = barrier_ctl_prepare(argc, argv, BC_FREEZE, 1, &data, &bc,
+                                &buf, sizeof(rawbuf));
+       if (rc)
+               return rc;
+
+       rc = l_ioctl(OBD_DEV_ID, OBD_IOC_BARRIER, buf);
+       if (rc < 0)
+               fprintf(stderr, "Fail to freeze barrier for %s: %s\n",
+                       argv[1], strerror(errno));
+
+       return rc;
+}
+
+/**
+ * lctl barrier_thaw <fsname>
+ *
+ * Send a BC_THAW barrier request for \a fsname to the MGS device.
+ *
+ * \param[in] argc     number of arguments, must be 2
+ * \param[in] argv     argv[1]: fsname
+ *
+ * \retval             CMD_HELP on wrong usage
+ * \retval             the non-zero value from the preparation step on failure
+ * \retval             otherwise the return value of l_ioctl()
+ */
+int jt_barrier_thaw(int argc, char **argv)
+{
+       struct obd_ioctl_data data;
+       char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
+       struct barrier_ctl bc;
+       int rc;
+
+       if (argc != 2)
+               return CMD_HELP;
+
+       rc = barrier_ctl_prepare(argc, argv, BC_THAW, 0, &data, &bc,
+                                &buf, sizeof(rawbuf));
+       if (rc)
+               return rc;
+
+       rc = l_ioctl(OBD_DEV_ID, OBD_IOC_BARRIER, buf);
+       if (rc < 0)
+               fprintf(stderr, "Fail to thaw barrier for %s: %s\n",
+                       argv[1], strerror(errno));
+
+       return rc;
+}
+
+/**
+ * lctl barrier_stat <fsname>
+ *
+ * Send a BC_STAT barrier request for \a fsname to the MGS device and print
+ * the returned barrier status. While the barrier is freezing or frozen, the
+ * returned bc_timeout is printed as the time left until it expires.
+ *
+ * \param[in] argc     number of arguments, must be 2
+ * \param[in] argv     argv[1]: fsname
+ *
+ * \retval             CMD_HELP on wrong usage
+ * \retval             the non-zero value from the preparation step on failure
+ * \retval             otherwise the return value of l_ioctl()
+ */
+int jt_barrier_stat(int argc, char **argv)
+{
+       struct obd_ioctl_data data;
+       char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
+       struct barrier_ctl bc;
+       int rc;
+
+       if (argc != 2)
+               return CMD_HELP;
+
+       rc = barrier_ctl_prepare(argc, argv, BC_STAT, 0, &data, &bc,
+                                &buf, sizeof(rawbuf));
+       if (rc)
+               return rc;
+
+       rc = l_ioctl(OBD_DEV_ID, OBD_IOC_BARRIER, buf);
+       if (rc < 0) {
+               fprintf(stderr, "Fail to query barrier for %s: %s\n",
+                       argv[1], strerror(errno));
+       } else {
+               /* bc is read right after this call for the reply */
+               obd_ioctl_unpack(&data, buf, sizeof(rawbuf));
+               printf("The barrier for %s is in '%s'\n",
+                      argv[1], barrier_status2name(bc.bc_status));
+               if (bc.bc_status == BS_FREEZING_P1 ||
+                   bc.bc_status == BS_FREEZING_P2 ||
+                   bc.bc_status == BS_FROZEN)
+                       printf("The barrier will be expired after %d "
+                              "seconds\n", bc.bc_timeout);
+       }
+
+       return rc;
+}
+
+/**
+ * lctl barrier_rescan <fsname> [timeout (in seconds)]
+ *
+ * Send a BC_RESCAN barrier request for \a fsname to the MGS device and print
+ * the returned bc_absence and bc_total as the number of inactive MDTs out of
+ * the total.
+ *
+ * \param[in] argc     number of arguments, 2 or 3
+ * \param[in] argv     argv[1]: fsname, argv[2]: optional timeout
+ *
+ * \retval             CMD_HELP on wrong usage
+ * \retval             the non-zero value from the preparation step on failure
+ * \retval             otherwise the return value of l_ioctl()
+ */
+int jt_barrier_rescan(int argc, char **argv)
+{
+       struct obd_ioctl_data data;
+       char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
+       struct barrier_ctl bc;
+       int rc;
+
+       if (argc < 2 || argc > 3)
+               return CMD_HELP;
+
+       rc = barrier_ctl_prepare(argc, argv, BC_RESCAN, 1, &data, &bc,
+                                &buf, sizeof(rawbuf));
+       if (rc)
+               return rc;
+
+       rc = l_ioctl(OBD_DEV_ID, OBD_IOC_BARRIER, buf);
+       if (rc < 0) {
+               fprintf(stderr, "Fail to rescan barrier bitmap for %s: %s\n",
+                       argv[1], strerror(errno));
+       } else {
+               /* bc is read right after this call for the reply */
+               obd_ioctl_unpack(&data, buf, sizeof(rawbuf));
+               printf("%u of %u MDT(s) in the filesystem %s are inactive\n",
+                      bc.bc_absence, bc.bc_total, argv[1]);
+       }
+
+       return rc;
+}
+
 int jt_get_obj_version(int argc, char **argv)
 {
        struct lu_fid fid;
index c20e374..be6a30e 100644 (file)
@@ -161,6 +161,10 @@ int jt_lcfg_setparam(int argc, char **argv);
 int jt_lcfg_listparam(int argc, char **argv);
 
 int jt_pool_cmd(int argc, char **argv);
+int jt_barrier_freeze(int argc, char **argv);
+int jt_barrier_thaw(int argc, char **argv);
+int jt_barrier_stat(int argc, char **argv);
+int jt_barrier_rescan(int argc, char **argv);
 int jt_nodemap_activate(int argc, char **argv);
 int jt_nodemap_add(int argc, char **argv);
 int jt_nodemap_del(int argc, char **argv);