Whamcloud - gitweb
LU-8900 snapshot: operate write barrier on MDT 63/24263/15
authorFan Yong <fan.yong@intel.com>
Wed, 12 Oct 2016 08:11:24 +0000 (16:11 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Tue, 14 Mar 2017 02:57:51 +0000 (02:57 +0000)
Currently, the Lustre barrier is implemented as a write barrier
on all MDTs. Each MDT in the system, when it starts,
registers a barrier instance that will be used in handling
subsequent barrier requests.

The barrier_handler() processes the barrier request: freeze
or thaw the barrier on the MDT. Freezing the barrier is the key
part. We use a two-phase barrier to guarantee that after the
barrier setup:

1) All the MDT side pending async modifications have been flushed.
2) Any subsequent modification will be blocked.
3) All async transactions on the MDTs have been committed.

For phase1, we do the following:

Firstly, it sets the barrier flag on the instance, which will block
subsequent modifications from clients. (Note: server-sponsored
modifications are still allowed so that pending modifications can be
flushed.)

Secondly, it flushes all pending modifications via dt_sync(),
such as async OST-object destroy, async OST-object owner changes,
and so on.

If some client-sponsored modifications are still in flight while
the barrier is freezing, they may generate new pending requests
after the first dt_sync(), so dt_sync() is called again once all
in-flight modifications are done.

With the phase1 barrier set, all pending cross-server modifications
have been flushed to the remote servers, and any new modification will
be blocked. But that does not guarantee that all the updates have been
committed to storage on the remote servers. So when all the instances
have done the phase1 barrier successfully, the MGS will notify all
instances to do the phase2 barrier as follows:

Every barrier instance calls dt_sync() so that all async
transactions are committed locally.

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: I8d209e98d175eacdadd25c385ffc1c3e4451527a
Reviewed-on: https://review.whamcloud.com/24263
Tested-by: Jenkins
Reviewed-by: Niu Yawei <yawei.niu@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
libcfs/include/libcfs/libcfs_debug.h
lustre/include/Makefile.am
lustre/include/lustre_barrier.h [new file with mode: 0644]
lustre/lod/lod_dev.c
lustre/mdd/mdd_device.c
lustre/mgc/mgc_request.c
lustre/ptlrpc/Makefile.in
lustre/target/Makefile.am
lustre/target/barrier.c [new file with mode: 0644]
lustre/target/tgt_internal.h
lustre/target/tgt_main.c

index 1f23b60..eff16dc 100644 (file)
@@ -104,7 +104,7 @@ struct ptldebug_header {
 #define S_LQUOTA       0x00040000
 #define S_OSD          0x00080000
 #define S_LFSCK                0x00100000
 #define S_LQUOTA       0x00040000
 #define S_OSD          0x00080000
 #define S_LFSCK                0x00100000
-/* unused */
+#define S_SNAPSHOT     0x00200000
 /* unused */
 #define S_LMV          0x00800000 /* b_new_cmd */
 /* unused */
 /* unused */
 #define S_LMV          0x00800000 /* b_new_cmd */
 /* unused */
@@ -119,8 +119,8 @@ struct ptldebug_header {
 #define LIBCFS_DEBUG_SUBSYS_NAMES {                                    \
        "undefined", "mdc", "mds", "osc", "ost", "class", "log",        \
        "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", "",  \
 #define LIBCFS_DEBUG_SUBSYS_NAMES {                                    \
        "undefined", "mdc", "mds", "osc", "ost", "class", "log",        \
        "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", "",  \
-       "echo", "ldlm", "lov", "lquota", "osd", "lfsck", "", "", "lmv", \
-        "", "sec", "gss", "", "mgc", "mgs", "fid", "fld", NULL }
+       "echo", "ldlm", "lov", "lquota", "osd", "lfsck", "snapshot", "",\
+       "lmv",  "", "sec", "gss", "", "mgc", "mgs", "fid", "fld", NULL }
 
 /* Debugging masks (32 bits, non-overlapping) */
 #define D_TRACE                0x00000001 /* ENTRY/EXIT markers */
 
 /* Debugging masks (32 bits, non-overlapping) */
 #define D_TRACE                0x00000001 /* ENTRY/EXIT markers */
@@ -153,13 +153,14 @@ struct ptldebug_header {
 #define D_SEC          0x08000000
 #define D_LFSCK                0x10000000 /* For both OI scrub and LFSCK */
 #define D_HSM          0x20000000
 #define D_SEC          0x08000000
 #define D_LFSCK                0x10000000 /* For both OI scrub and LFSCK */
 #define D_HSM          0x20000000
+#define D_SNAPSHOT     0x40000000 /* snapshot */
 
 #define LIBCFS_DEBUG_MASKS_NAMES {                                     \
        "trace", "inode", "super", "ext2", "malloc", "cache", "info",   \
        "ioctl", "neterror", "net", "warning", "buffs", "other",        \
        "dentry", "nettrace", "page", "dlmtrace", "error", "emerg",     \
        "ha", "rpctrace", "vfstrace", "reada", "mmap", "config",        \
 
 #define LIBCFS_DEBUG_MASKS_NAMES {                                     \
        "trace", "inode", "super", "ext2", "malloc", "cache", "info",   \
        "ioctl", "neterror", "net", "warning", "buffs", "other",        \
        "dentry", "nettrace", "page", "dlmtrace", "error", "emerg",     \
        "ha", "rpctrace", "vfstrace", "reada", "mmap", "config",        \
-       "console", "quota", "sec", "lfsck", "hsm", NULL }
+       "console", "quota", "sec", "lfsck", "hsm", "snapshot", NULL }
 
 #define D_CANTMASK   (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)
 
 
 #define D_CANTMASK   (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)
 
index 9074ca4..0ca7327 100644 (file)
@@ -45,6 +45,7 @@ EXTRA_DIST = \
        lu_object.h \
        lu_ref.h \
        lustre_acl.h \
        lu_object.h \
        lu_ref.h \
        lustre_acl.h \
+       lustre_barrier.h \
        lustre_cfg.h \
        lustre_compat.h \
        lustre_debug.h \
        lustre_cfg.h \
        lustre_compat.h \
        lustre_debug.h \
diff --git a/lustre/include/lustre_barrier.h b/lustre/include/lustre_barrier.h
new file mode 100644 (file)
index 0000000..13ed590
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * lustre/include/lustre_barrier.h
+ *
+ * Lustre write barrier (on MDT) exported functions.
+ *
+ * Author: Fan, Yong <fan.yong@intel.com>
+ */
+
+#ifndef _LUSTRE_BARRIER_H
+# define _LUSTRE_BARRIER_H
+
+#include <dt_object.h>
+#include <lustre_export.h>
+
+int barrier_handler(struct dt_device *key, struct ptlrpc_request *req);
+int barrier_register(struct dt_device *key, struct dt_device *next);
+void barrier_deregister(struct dt_device *key);
+
+#endif /* _LUSTRE_BARRIER_H */
index 76f0884..71be4e2 100644 (file)
@@ -1385,6 +1385,7 @@ static int lod_sync(const struct lu_env *env, struct dt_device *dev)
 {
        struct lod_device   *lod = dt2lod_dev(dev);
        struct lod_ost_desc *ost;
 {
        struct lod_device   *lod = dt2lod_dev(dev);
        struct lod_ost_desc *ost;
+       struct lod_mdt_desc *mdt;
        unsigned int         i;
        int                  rc = 0;
        ENTRY;
        unsigned int         i;
        int                  rc = 0;
        ENTRY;
@@ -1395,12 +1396,29 @@ static int lod_sync(const struct lu_env *env, struct dt_device *dev)
                LASSERT(ost && ost->ltd_ost);
                rc = dt_sync(env, ost->ltd_ost);
                if (rc) {
                LASSERT(ost && ost->ltd_ost);
                rc = dt_sync(env, ost->ltd_ost);
                if (rc) {
-                       CERROR("%s: can't sync %u: %d\n",
+                       CERROR("%s: can't sync ost %u: %d\n",
                               lod2obd(lod)->obd_name, i, rc);
                        break;
                }
        }
        lod_putref(lod, &lod->lod_ost_descs);
                               lod2obd(lod)->obd_name, i, rc);
                        break;
                }
        }
        lod_putref(lod, &lod->lod_ost_descs);
+
+       if (rc)
+               RETURN(rc);
+
+       lod_getref(&lod->lod_mdt_descs);
+       lod_foreach_mdt(lod, i) {
+               mdt = MDT_TGT(lod, i);
+               LASSERT(mdt && mdt->ltd_mdt);
+               rc = dt_sync(env, mdt->ltd_mdt);
+               if (rc) {
+                       CERROR("%s: can't sync mdt %u: %d\n",
+                              lod2obd(lod)->obd_name, i, rc);
+                       break;
+               }
+       }
+       lod_putref(lod, &lod->lod_mdt_descs);
+
        if (rc == 0)
                rc = dt_sync(env, lod->lod_child);
 
        if (rc == 0)
                rc = dt_sync(env, lod->lod_child);
 
index 88378b1..57fe3de 100644 (file)
@@ -48,6 +48,7 @@
 #include <lustre_param.h>
 #include <lustre_fid.h>
 #include <lustre_nodemap.h>
 #include <lustre_param.h>
 #include <lustre_fid.h>
 #include <lustre_nodemap.h>
+#include <lustre_barrier.h>
 
 #include "mdd_internal.h"
 
 
 #include "mdd_internal.h"
 
@@ -874,6 +875,7 @@ static int mdd_hsm_actions_llog_fini(const struct lu_env *env,
 static void mdd_device_shutdown(const struct lu_env *env, struct mdd_device *m,
                                struct lustre_cfg *cfg)
 {
 static void mdd_device_shutdown(const struct lu_env *env, struct mdd_device *m,
                                struct lustre_cfg *cfg)
 {
+       barrier_deregister(m->mdd_bottom);
        lfsck_degister(env, m->mdd_bottom);
        mdd_hsm_actions_llog_fini(env, m);
        mdd_changelog_fini(env, m);
        lfsck_degister(env, m->mdd_bottom);
        mdd_hsm_actions_llog_fini(env, m);
        mdd_changelog_fini(env, m);
@@ -1074,8 +1076,17 @@ static int mdd_prepare(const struct lu_env *env,
                GOTO(out_nodemap, rc);
        }
 
                GOTO(out_nodemap, rc);
        }
 
+       rc = barrier_register(mdd->mdd_bottom, mdd->mdd_child);
+       if (rc) {
+               CERROR("%s: failed to register to barrier: rc = %d\n",
+                      mdd2obd_dev(mdd)->obd_name, rc);
+               GOTO(out_lfsck, rc);
+       }
+
        RETURN(0);
 
        RETURN(0);
 
+out_lfsck:
+       lfsck_degister(env, mdd->mdd_bottom);
 out_nodemap:
        nm_config_file_deregister_tgt(env, mdd2obd_dev(mdd)->u.obt.obt_nodemap_config_file);
        mdd2obd_dev(mdd)->u.obt.obt_nodemap_config_file = NULL;
 out_nodemap:
        nm_config_file_deregister_tgt(env, mdd2obd_dev(mdd)->u.obt.obt_nodemap_config_file);
        mdd2obd_dev(mdd)->u.obt.obt_nodemap_config_file = NULL;
index 423a390..da1c3ce 100644 (file)
@@ -48,6 +48,7 @@
 #include <lustre_nodemap.h>
 #include <lustre_swab.h>
 #include <obd_class.h>
 #include <lustre_nodemap.h>
 #include <lustre_swab.h>
 #include <obd_class.h>
+#include <lustre_barrier.h>
 
 #include "mgc_internal.h"
 
 
 #include "mgc_internal.h"
 
@@ -1836,8 +1837,17 @@ out:
 
 static int mgc_barrier_glimpse_ast(struct ldlm_lock *lock, void *data)
 {
 
 static int mgc_barrier_glimpse_ast(struct ldlm_lock *lock, void *data)
 {
-       /* XXX: It will be implemented in subsequent patch. */
-       return 0;
+       struct config_llog_data *cld = lock->l_ast_data;
+       int rc;
+       ENTRY;
+
+       if (cld->cld_stopping)
+               RETURN(-ENODEV);
+
+       rc = barrier_handler(s2lsi(cld->cld_cfg.cfg_sb)->lsi_dt_dev,
+                            (struct ptlrpc_request *)data);
+
+       RETURN(rc);
 }
 
 /* Copy a remote log locally */
 }
 
 /* Copy a remote log locally */
index 488f757..2426dfa 100644 (file)
@@ -28,7 +28,7 @@ nodemap_objs := nodemap_handler.o nodemap_lproc.o nodemap_range.o
 nodemap_objs += nodemap_idmap.o nodemap_rbtree.o nodemap_member.o
 nodemap_objs += nodemap_storage.o
 
 nodemap_objs += nodemap_idmap.o nodemap_rbtree.o nodemap_member.o
 nodemap_objs += nodemap_storage.o
 
-ptlrpc-objs := $(ldlm_objs) $(ptlrpc_objs)
+ptlrpc-objs := $(ldlm_objs) $(ptlrpc_objs) $(TARGET)barrier.o
 @SERVER_TRUE@ptlrpc-objs += $(target_objs) $(nodemap_objs)
 
 @GSS_TRUE@subdir-m += gss
 @SERVER_TRUE@ptlrpc-objs += $(target_objs) $(nodemap_objs)
 
 @GSS_TRUE@subdir-m += gss
@@ -50,6 +50,9 @@ tgt_%.c: @LUSTRE@/target/tgt_%.c
 out_%.c: @LUSTRE@/target/out_%.c
        ln -sf $< $@
 
 out_%.c: @LUSTRE@/target/out_%.c
        ln -sf $< $@
 
+barrier.c: @LUSTRE@/target/barrier.c
+       ln -sf $< $@
+
 EXTRA_DIST := $(ptlrpc_objs:.o=.c) ptlrpc_internal.h
 EXTRA_DIST += $(nodemap_objs:.o=.c) nodemap_internal.h
 
 EXTRA_DIST := $(ptlrpc_objs:.o=.c) ptlrpc_internal.h
 EXTRA_DIST += $(nodemap_objs:.o=.c) nodemap_internal.h
 
index 32e978b..c090bdd 100644 (file)
@@ -32,7 +32,7 @@
 
 MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
 EXTRA_DIST = tgt_main.c tgt_lastrcvd.c tgt_handler.c tgt_internal.h \
 
 MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
 EXTRA_DIST = tgt_main.c tgt_lastrcvd.c tgt_handler.c tgt_internal.h \
-            out_handler.c out_lib.c
+            out_handler.c out_lib.c barrier.c
 EXTRA_DIST += update_trans.c
 EXTRA_DIST += update_records.c
 EXTRA_DIST += update_recovery.c
 EXTRA_DIST += update_trans.c
 EXTRA_DIST += update_records.c
 EXTRA_DIST += update_recovery.c
diff --git a/lustre/target/barrier.c b/lustre/target/barrier.c
new file mode 100644 (file)
index 0000000..3710908
--- /dev/null
@@ -0,0 +1,368 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * lustre/target/barrier.c
+ *
+ * Currently, the Lustre barrier is implemented as write barrier on all MDTs.
+ * For each MDT in the system, when it starts, it registers a barrier instance
+ * that will be used in handling subsequent barrier requests.
+ *
+ * Author: Fan, Yong <fan.yong@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SNAPSHOT
+
+#include <linux/percpu_counter.h>
+
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_barrier.h>
+#include <lustre/lustre_barrier_user.h>
+
+static LIST_HEAD(barrier_instance_list);
+static DEFINE_SPINLOCK(barrier_instance_lock);
+/*
+ * Per-device write barrier state. One instance is created by
+ * barrier_register() and linked on barrier_instance_list.
+ *
+ * bi_link     - linkage on barrier_instance_list
+ * bi_bottom   - device used as the lookup key; also the source of the
+ *               name and index reported in logs and in the reply LVB
+ * bi_next     - device that barrier_freeze() passes to dt_sync()
+ * bi_waitq    - barrier_freeze() sleeps here until bi_writers sums to zero
+ *               (the waker is not part of this file)
+ * bi_rwlock   - write-locked by barrier_freeze() while it sets the status
+ *               and samples bi_writers
+ * bi_writers  - per-CPU count that barrier_freeze() treats as the number of
+ *               in-flight modifications (updated outside this file)
+ * bi_ref      - reference count; starts at 1, the instance is freed when it
+ *               drops to zero
+ * bi_deadline - absolute time in seconds by which a freeze must complete
+ * bi_status   - current barrier status (BS_* value)
+ */
+struct barrier_instance {
+       struct list_head         bi_link;
+       struct dt_device        *bi_bottom;
+       struct dt_device        *bi_next;
+       wait_queue_head_t        bi_waitq;
+       rwlock_t                 bi_rwlock;
+       struct percpu_counter    bi_writers;
+       atomic_t                 bi_ref;
+       time_t                   bi_deadline;
+       __u32                    bi_status;
+};
+/*
+ * Return the obd_name of the OBD device attached to the instance's bottom
+ * device. This file uses it to prefix its log messages.
+ */
+static inline char *barrier_barrier2name(struct barrier_instance *barrier)
+{
+       return barrier->bi_bottom->dd_lu_dev.ld_obd->obd_name;
+}
+/*
+ * Return the ss_node_id of the sequence site behind the instance's bottom
+ * device. barrier_handler() reports this value as lvb_index; presumably it
+ * is the MDT index -- TODO confirm against lu_site2seq().
+ */
+static inline __u32 barrier_dev_idx(struct barrier_instance *barrier)
+{
+       return lu_site2seq(barrier->bi_bottom->dd_lu_dev.ld_site)->ss_node_id;
+}
+/*
+ * Destroy the per-CPU writer counter and free the instance.
+ *
+ * The instance must already be off barrier_instance_list (asserted).
+ * Called when the last reference is dropped, and by barrier_register()
+ * when percpu_counter_init() fails.
+ */
+static void barrier_instance_cleanup(struct barrier_instance *barrier)
+{
+       LASSERT(list_empty(&barrier->bi_link));
+
+       percpu_counter_destroy(&barrier->bi_writers);
+       OBD_FREE_PTR(barrier);
+}
+/*
+ * Drop one reference on the instance; the instance is cleaned up and freed
+ * when the reference count reaches zero.
+ */
+static inline void barrier_instance_put(struct barrier_instance *barrier)
+{
+       if (atomic_dec_and_test(&barrier->bi_ref))
+               barrier_instance_cleanup(barrier);
+}
+/*
+ * Walk barrier_instance_list and return the instance whose bi_bottom is
+ * \a key, or NULL if there is none. No reference is taken on the result.
+ *
+ * Takes no lock itself; every caller in this file holds
+ * barrier_instance_lock around the call.
+ */
+static struct barrier_instance *
+barrier_instance_find_locked(struct dt_device *key)
+{
+       struct barrier_instance *barrier;
+
+       list_for_each_entry(barrier, &barrier_instance_list, bi_link) {
+               if (barrier->bi_bottom == key)
+                       return barrier;
+       }
+
+       return NULL;
+}
+/*
+ * Append the instance to the tail of barrier_instance_list under
+ * barrier_instance_lock. Asserts that no instance with the same bi_bottom
+ * is already on the list.
+ */
+static void barrier_instance_add(struct barrier_instance *barrier)
+{
+       struct barrier_instance *tmp;
+
+       spin_lock(&barrier_instance_lock);
+       tmp = barrier_instance_find_locked(barrier->bi_bottom);
+       LASSERT(!tmp);
+
+       list_add_tail(&barrier->bi_link, &barrier_instance_list);
+       spin_unlock(&barrier_instance_lock);
+}
+/*
+ * Look up the instance registered for \a key and take a reference on it
+ * while still holding barrier_instance_lock.
+ *
+ * Returns the instance, or NULL if none is registered. A non-NULL result
+ * must be released with barrier_instance_put().
+ */
+static struct barrier_instance *barrier_instance_find(struct dt_device *key)
+{
+       struct barrier_instance *barrier;
+
+       spin_lock(&barrier_instance_lock);
+       barrier = barrier_instance_find_locked(key);
+       if (barrier)
+               atomic_inc(&barrier->bi_ref);
+       spin_unlock(&barrier_instance_lock);
+
+       return barrier;
+}
+/*
+ * Set the instance's status to \a status, logging the transition at
+ * D_SNAPSHOT level. Does nothing when the status is already \a status.
+ * Takes no lock itself.
+ */
+static void barrier_set(struct barrier_instance *barrier, __u32 status)
+{
+       if (barrier->bi_status != status) {
+               CDEBUG(D_SNAPSHOT, "%s: change barrier status from %u to %u\n",
+                      barrier_barrier2name(barrier),
+                      barrier->bi_status, status);
+
+               barrier->bi_status = status;
+       }
+}
+
+/**
+ * Create the barrier for the given instance.
+ *
+ * We use a two-phase barrier to guarantee that after the barrier setup:
+ * 1) All the MDT side pending async modifications have been flushed.
+ * 2) Any subsequent modification will be blocked.
+ * 3) All async transactions on the MDTs have been committed.
+ *
+ * For phase1, we do the following:
+ *
+ * Firstly, it sets the barrier flag on the instance, which will block
+ * subsequent modifications from clients. (Note: server-sponsored
+ * modifications are still allowed so that pending modifications can be
+ * flushed.)
+ *
+ * Secondly, it flushes all pending modifications via dt_sync(), such as
+ * async OST-object destroy, async OST-object owner changes, and so on.
+ *
+ * If some client-sponsored modifications are still in flight while the
+ * barrier is freezing, they may generate new pending requests after the
+ * first dt_sync(), so dt_sync() is called again once all in-flight
+ * modifications are done.
+ *
+ * With the phase1 barrier set, all pending cross-server modifications have
+ * been flushed to the remote servers, and any new modification will be
+ * blocked. But that does not guarantee that all the updates have been
+ * committed to storage on the remote servers. So when all the instances have
+ * done the phase1 barrier successfully, the MGS will notify all instances to
+ * do the phase2 barrier as follows:
+ *
+ * Every barrier instance calls dt_sync() so that all async transactions are
+ * committed locally.
+ *
+ * The caller must have set barrier->bi_deadline (asserted non-zero). The
+ * deadline is checked after each dt_sync() and bounds the wait for in-flight
+ * modifications; when it has passed, 1 is returned. On success the status is
+ * left at BS_FREEZING_P1 for phase1 and moved to BS_FROZEN for phase2.
+ *
+ * \param[in] env      pointer to the thread context
+ * \param[in] barrier  pointer to the barrier instance
+ * \param[in] phase1   indicate whether it is phase1 barrier or not
+ *
+ * \retval             positive number for timeout
+ * \retval             0 for success
+ * \retval             negative error number on failure
+ */
+static int barrier_freeze(const struct lu_env *env,
+                         struct barrier_instance *barrier, bool phase1)
+{
+       int left;
+       int rc = 0;
+       __s64 inflight = 0;
+       ENTRY;
+
+       write_lock(&barrier->bi_rwlock);
+       barrier_set(barrier, phase1 ? BS_FREEZING_P1 : BS_FREEZING_P2);
+
+       /* Prevent the barrier_set() above and the check of the inflight
+        * modifications count below from being executed out of order. */
+       smp_mb();
+
+       if (phase1)
+               inflight = percpu_counter_sum(&barrier->bi_writers);
+       write_unlock(&barrier->bi_rwlock);
+
+       rc = dt_sync(env, barrier->bi_next);
+       if (rc)
+               RETURN(rc);
+
+       LASSERT(barrier->bi_deadline != 0);
+
+       left = barrier->bi_deadline - cfs_time_current_sec();
+       if (left <= 0)
+               RETURN(1);
+
+       if (phase1 && inflight != 0) {
+               struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(left),
+                                                    NULL, NULL);
+
+               rc = l_wait_event(barrier->bi_waitq,
+                                 percpu_counter_sum(&barrier->bi_writers) == 0,
+                                 &lwi);
+               if (rc)
+                       RETURN(1);
+
+               /* Sync again after all inflight modifications are done. */
+               rc = dt_sync(env, barrier->bi_next);
+               if (rc)
+                       RETURN(rc);
+
+               if (cfs_time_beforeq(barrier->bi_deadline,
+                                    cfs_time_current_sec()))
+                       RETURN(1);
+       }
+
+       CDEBUG(D_SNAPSHOT, "%s: barrier freezing %s done.\n",
+              barrier_barrier2name(barrier), phase1 ? "phase1" : "phase2");
+
+       if (!phase1)
+               barrier_set(barrier, BS_FROZEN);
+
+       RETURN(0);
+}
+/*
+ * Module-level initialization hook for the barrier code, called from
+ * tgt_mod_init(). Currently there is nothing to set up.
+ */
+void barrier_init(void)
+{
+}
+/*
+ * Module-level teardown hook for the barrier code, called from
+ * tgt_mod_exit(). Asserts that every barrier instance has been taken off
+ * barrier_instance_list by then.
+ */
+void barrier_fini(void)
+{
+       LASSERT(list_empty(&barrier_instance_list));
+}
+/*
+ * Handle a barrier glimpse request for the instance registered under \a key.
+ *
+ * The requested status is read from the glimpse descriptor in \a req:
+ * - BS_RESCAN: reset the status to BS_INIT.
+ * - BS_FREEZING_P1 / BS_FREEZING_P2: set the deadline to now plus
+ *   lgbd_timeout seconds and run the matching phase of barrier_freeze().
+ * - BS_THAWING / BS_FAILED / BS_EXPIRED: set the status to BS_THAWED.
+ * - anything else: warn and fail with -EINVAL.
+ *
+ * Once the instance has been found, a negative result marks it BS_FAILED
+ * and a positive one (timeout) marks it BS_EXPIRED. The resulting status and
+ * the device index are written to the reply LVB, and the function itself
+ * returns 0.
+ *
+ * \param[in] key      device the barrier instance was registered with
+ * \param[in] req      the glimpse request; its reply is packed here and
+ *                     rq_status is set to the return value
+ *
+ * \retval             0 when an instance was found (see the reply LVB for
+ *                     the outcome)
+ * \retval             -EPROTO if the request carries no glimpse descriptor
+ * \retval             -ENODEV if no instance is registered for \a key
+ * \retval             other negative error number if the reply cannot be
+ *                     packed
+ */
+int barrier_handler(struct dt_device *key, struct ptlrpc_request *req)
+{
+       struct ldlm_gl_barrier_desc *desc;
+       struct barrier_instance *barrier;
+       struct barrier_lvb *lvb;
+       struct lu_env env;
+       int rc = 0;
+       ENTRY;
+
+       /* glimpse on barrier locks always packs a glimpse descriptor */
+       req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_DESC_CALLBACK);
+       desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC);
+       if (!desc)
+               GOTO(out, rc = -EPROTO);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+                             sizeof(struct barrier_lvb));
+       rc = req_capsule_server_pack(&req->rq_pill);
+       if (rc)
+               GOTO(out, rc);
+
+       lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
+       barrier = barrier_instance_find(key);
+       if (!barrier)
+               GOTO(out, rc = -ENODEV);
+
+       rc = lu_env_init(&env, LCT_MD_THREAD | LCT_DT_THREAD);
+       if (rc)
+               GOTO(out_barrier, rc);
+
+       CDEBUG(D_SNAPSHOT,
+              "%s: handling barrier request: status %u, timeout %u\n",
+              barrier_barrier2name(barrier),
+              desc->lgbd_status, desc->lgbd_timeout);
+
+       switch (desc->lgbd_status) {
+       case BS_RESCAN:
+               barrier_set(barrier, BS_INIT);
+               break;
+       case BS_FREEZING_P1:
+       case BS_FREEZING_P2:
+               barrier->bi_deadline = cfs_time_current_sec() +
+                                       desc->lgbd_timeout;
+               rc = barrier_freeze(&env, barrier,
+                                   desc->lgbd_status == BS_FREEZING_P1);
+               break;
+       case BS_THAWING:
+       case BS_FAILED:
+       case BS_EXPIRED:
+               barrier_set(barrier, BS_THAWED);
+               break;
+       default:
+               CWARN("%s: unexpected barrier status %u\n",
+                     barrier_barrier2name(barrier), desc->lgbd_status);
+               rc = -EINVAL;
+               break;
+       }
+
+       GOTO(fini, rc);
+
+fini:
+       lu_env_fini(&env);
+
+out_barrier:
+       if (rc < 0)
+               barrier_set(barrier, BS_FAILED);
+       else if (rc > 0)
+               barrier_set(barrier, BS_EXPIRED);
+
+       lvb->lvb_status = barrier->bi_status;
+       lvb->lvb_index = barrier_dev_idx(barrier);
+
+       CDEBUG(D_SNAPSHOT, "%s: handled barrier request: status %u, "
+              "deadline %lu: rc = %d\n", barrier_barrier2name(barrier),
+              lvb->lvb_status, barrier->bi_deadline, rc);
+
+       barrier_instance_put(barrier);
+       rc = 0; /* the outcome is reported through lvb_status, not rc */
+
+out:
+       req->rq_status = rc;
+       return rc;
+}
+EXPORT_SYMBOL(barrier_handler);
+/*
+ * Allocate a barrier instance for \a key and add it to the global list.
+ *
+ * The instance starts with one reference, which barrier_deregister() drops.
+ * If the per-CPU writer counter cannot be initialized, the instance is
+ * freed again and the error is returned.
+ *
+ * \param[in] key      device the instance is looked up by
+ * \param[in] next     device that will be synced when freezing
+ *
+ * \retval             0 for success
+ * \retval             -ENOMEM if the instance cannot be allocated
+ * \retval             error returned by percpu_counter_init() on failure
+ */
+int barrier_register(struct dt_device *key, struct dt_device *next)
+{
+       struct barrier_instance *barrier;
+       int rc;
+       ENTRY;
+
+       OBD_ALLOC_PTR(barrier);
+       if (!barrier)
+               RETURN(-ENOMEM);
+
+       INIT_LIST_HEAD(&barrier->bi_link);
+       barrier->bi_bottom = key;
+       barrier->bi_next = next;
+       init_waitqueue_head(&barrier->bi_waitq);
+       rwlock_init(&barrier->bi_rwlock);
+       atomic_set(&barrier->bi_ref, 1);
+#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG
+       rc = percpu_counter_init(&barrier->bi_writers, 0, GFP_KERNEL);
+#else
+       rc = percpu_counter_init(&barrier->bi_writers, 0);
+#endif
+       if (rc)
+               barrier_instance_cleanup(barrier);
+       else
+               barrier_instance_add(barrier);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(barrier_register);
+/*
+ * Remove the instance registered for \a key from the global list and drop
+ * one reference on it. The instance is freed once all other references
+ * (e.g. one held by a running barrier_handler()) are gone. Does nothing if
+ * no instance is registered for \a key.
+ */
+void barrier_deregister(struct dt_device *key)
+{
+       struct barrier_instance *barrier;
+
+       spin_lock(&barrier_instance_lock);
+       barrier = barrier_instance_find_locked(key);
+       if (barrier)
+               list_del_init(&barrier->bi_link);
+       spin_unlock(&barrier_instance_lock);
+
+       if (barrier)
+               barrier_instance_put(barrier);
+}
+EXPORT_SYMBOL(barrier_deregister);
index c378d85..bc9cbbc 100644 (file)
@@ -285,4 +285,6 @@ int top_trans_create_tmt(const struct lu_env *env,
                         struct top_thandle *top_th);
 
 void tgt_cancel_slc_locks(struct lu_target *tgt, __u64 transno);
                         struct top_thandle *top_th);
 
 void tgt_cancel_slc_locks(struct lu_target *tgt, __u64 transno);
+void barrier_init(void);
+void barrier_fini(void);
 #endif /* _TG_INTERNAL_H */
 #endif /* _TG_INTERNAL_H */
index 0bf062e..2f96846 100644 (file)
@@ -411,6 +411,7 @@ int tgt_mod_init(void)
 
        tgt_ses_key_init_generic(&tgt_session_key, NULL);
        lu_context_key_register_many(&tgt_session_key, NULL);
 
        tgt_ses_key_init_generic(&tgt_session_key, NULL);
        lu_context_key_register_many(&tgt_session_key, NULL);
+       barrier_init();
 
        update_info_init();
 
 
        update_info_init();
 
@@ -419,6 +420,7 @@ int tgt_mod_init(void)
 
 void tgt_mod_exit(void)
 {
 
 void tgt_mod_exit(void)
 {
+       barrier_fini();
        if (tgt_page_to_corrupt != NULL)
                put_page(tgt_page_to_corrupt);
 
        if (tgt_page_to_corrupt != NULL)
                put_page(tgt_page_to_corrupt);