4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License version 2 for more details. A copy is
14 * included in the COPYING file that accompanied this code.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 * Copyright (c) 2017, Intel Corporation.
25 * lustre/mgs/mgs_barrier.c
27 * Author: Fan, Yong <fan.yong@intel.com>
30 #define DEBUG_SUBSYSTEM S_MGS
31 #define D_MGS D_CONFIG
33 #include <uapi/linux/lustre/lustre_ioctl.h>
34 #include <lustre_swab.h>
35 #include <uapi/linux/lustre/lustre_barrier_user.h>
37 #include "mgs_internal.h"
40 * Handle the barrier lock glimpse reply.
42 * The barrier lock glimpse reply contains the target MDT's index and
43 * the barrier operation status on such MDT. With such information, if
44 * the MDT given barrier status is the expected one, then set related
45 * 'fsdb''s barrier bitmap; otherwise record the failure or status.
47 * \param[in] env pointer to the thread context
48 * \param[in] req pointer to the glimpse callback RPC request
49 * \param[in] data pointer to the async glimpse callback data
50 * \param[in] rc the glimpse callback RPC return value
52 * \retval 0 for success
53 * \retval negative error number on failure
/*
 * Glimpse reply callback for a single barrier lock: folds the barrier
 * status reported in the reply LVB into the barrier fsdb.
 */
55 static int mgs_barrier_gl_interpret_reply(const struct lu_env *env,
56 struct ptlrpc_request *req,
59 struct ldlm_cb_async_args *ca = data;
/* The barrier fsdb travels in gl_interpret_data of the glimpse set argument. */
60 struct fs_db *fsdb = ca->ca_set_arg->gl_interpret_data;
61 struct barrier_lvb *lvb;
66 /* The lock is useless, cancel it. */
67 ldlm_lock_cancel(ca->ca_lock);
/* Fetch the barrier LVB from the reply, swabbing via lustre_swab_barrier_lvb. */
74 lvb = req_capsule_server_swab_get(&req->rq_pill, &RMF_DLM_LVB,
75 lustre_swab_barrier_lvb);
/* NOTE(review): the guarding condition is not visible here; presumably taken when no LVB was returned. */
77 GOTO(out, rc = -EPROTO);
/*
 * Expected status: mark this MDT's bit in fsdb_barrier_map.
 * Any other status: record it in fsdb_barrier_result, unless this MDT's
 * bit has already been set.
 */
79 if (lvb->lvb_status == fsdb->fsdb_barrier_expected) {
/*
 * NOTE(review): the index is compared against INDEX_MAP_SIZE, while the
 * barrier bitmaps are scanned elsewhere in this file over
 * INDEX_MAP_SIZE * 8 bits, and the test_bit() branch below uses
 * lvb_index without any visible range check. Confirm the intended
 * bound for an index received from a remote MDT.
 */
80 if (unlikely(lvb->lvb_index > INDEX_MAP_SIZE))
83 set_bit(lvb->lvb_index, fsdb->fsdb_barrier_map);
84 } else if (likely(!test_bit(lvb->lvb_index, fsdb->fsdb_barrier_map))) {
85 fsdb->fsdb_barrier_result = lvb->lvb_status;
/* Publish the error code as the barrier result (guard not visible here; presumably only when rc is non-zero). */
92 fsdb->fsdb_barrier_result = rc;
98 * Send glimpse callback to the barrier locks holders.
100 * The glimpse callback takes the current barrier status. The barrier locks
101 * holders (on the MDTs) will take related barrier actions according to the
102 * given barrier status, then return their local barrier status.
104 * \param[in] env pointer to the thread context
105 * \param[in] mgs pointer to the MGS device
106 * \param[in] fsdb pointer to the barrier 'fsdb'
107 * \param[in] timeout indicate when the barrier will be expired
108 * \param[in] expected the expected barrier status on remote servers (MDTs)
110 * \retval positive number for unexpected barrier status
111 * \retval 0 for success
112 * \retval negative error number on failure
/*
 * Glimpse the granted locks on the barrier resource of 'fsdb', passing the
 * current barrier status and 'timeout' in the glimpse descriptor. Replies
 * are processed by mgs_barrier_gl_interpret_reply(), which accumulates
 * per-MDT results in 'fsdb'.
 */
114 static int mgs_barrier_glimpse_lock(const struct lu_env *env,
115 struct mgs_device *mgs,
117 __u32 timeout, __u32 expected)
/* The glimpse descriptor lives in the per-thread MGS env info. */
119 union ldlm_gl_desc *desc = &mgs_env_info(env)->mgi_gl_desc;
120 struct ldlm_res_id res_id;
121 struct ldlm_resource *res;
122 struct ldlm_glimpse_work *work;
123 struct ldlm_glimpse_work *tmp;
124 struct list_head gl_list = LIST_HEAD_INIT(gl_list);
125 struct list_head *pos;
/* Callers must have populated the MDT count before glimpsing. */
130 LASSERT(fsdb->fsdb_mdt_count > 0);
/* Derive the barrier resource id from the fsdb name. */
132 rc = mgc_logname2resid(fsdb->fsdb_name, &res_id, CONFIG_T_BARRIER);
136 res = ldlm_resource_get(mgs->mgs_obd->obd_namespace, NULL, &res_id,
139 RETURN(PTR_ERR(res));
/*
 * Reset the aggregated result, remember which status the MDTs are
 * expected to report, and fill in the descriptor sent with each glimpse.
 */
141 fsdb->fsdb_barrier_result = 0;
142 fsdb->fsdb_barrier_expected = expected;
143 desc->barrier_desc.lgbd_status = fsdb->fsdb_barrier_status;
144 desc->barrier_desc.lgbd_timeout = timeout;
/* Release the lock reference held by each work item already on gl_list. */
147 list_for_each_entry(work, &gl_list, gl_list) {
151 LDLM_LOCK_RELEASE(work->gl_lock);
152 work->gl_lock = NULL;
155 /* It is not big issue to alloc more work item than needed. */
156 for (i = 0; i < fsdb->fsdb_mdt_count; i++) {
159 GOTO(out, rc = -ENOMEM);
161 list_add_tail(&work->gl_list, &gl_list);
/* Start filling from the first queued work item. */
164 work = list_entry(gl_list.next, struct ldlm_glimpse_work, gl_list);
/* Bind each granted lock to a work item; the item holds its own lock reference. */
167 list_for_each(pos, &res->lr_granted) {
168 struct ldlm_lock *lock = list_entry(pos, struct ldlm_lock,
171 work->gl_lock = LDLM_LOCK_GET(lock);
173 work->gl_desc = desc;
174 work->gl_interpret_reply = mgs_barrier_gl_interpret_reply;
175 work->gl_interpret_data = fsdb;
/* The last work item has been used; check whether granted locks remain. */
177 if (unlikely(work->gl_list.next == &gl_list)) {
178 if (likely(pos->next == &res->lr_granted))
182 /* The granted locks are more than the MDTs count. */
186 work = list_entry(work->gl_list.next, struct ldlm_glimpse_work,
191 /* The MDTs count may be more than the granted locks. */
192 list_for_each_entry_safe_reverse(work, tmp, &gl_list, gl_list) {
196 list_del(&work->gl_list);
/* Only issue the glimpse when at least one work item is queued. */
200 if (!list_empty(&gl_list))
201 rc = ldlm_glimpse_locks(res, &gl_list);
/* Cleanup: unlink every remaining work item and drop its lock reference. */
208 list_for_each_entry_safe(work, tmp, &gl_list, gl_list) {
209 list_del(&work->gl_list);
211 LDLM_LOCK_RELEASE(work->gl_lock);
215 ldlm_resource_putref(res);
/* Hand back the result accumulated by the reply callbacks (guard not visible here). */
217 rc = fsdb->fsdb_barrier_result;
/*
 * Seed the barrier fsdb 'b_fsdb' with the MDT index map and MDT count of
 * the fsdb found under 'name'. Does nothing if no such fsdb is found.
 */
222 static void mgs_barrier_bitmap_setup(struct mgs_device *mgs,
223 struct fs_db *b_fsdb,
226 struct fs_db *c_fsdb;
228 c_fsdb = mgs_find_fsdb(mgs, name);
229 if (likely(c_fsdb)) {
230 memcpy(b_fsdb->fsdb_mdt_index_map,
231 c_fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE);
232 b_fsdb->fsdb_mdt_count = c_fsdb->fsdb_mdt_count;
/* Release the fsdb obtained from mgs_find_fsdb() above. */
233 mgs_put_fsdb(mgs, c_fsdb);
/*
 * Check whether every MDT recorded in fsdb_mdt_index_map also has its bit
 * set in fsdb_barrier_map.
 */
237 static bool mgs_barrier_done(struct fs_db *fsdb)
/* Both maps are INDEX_MAP_SIZE bytes, hence INDEX_MAP_SIZE * 8 bits. */
241 for (i = 0; i < INDEX_MAP_SIZE * 8; i++) {
/* A known MDT whose barrier bit is not set (the action taken is not visible here; presumably "not done"). */
242 if (test_bit(i, fsdb->fsdb_mdt_index_map) &&
243 !test_bit(i, fsdb->fsdb_barrier_map))
/*
 * Compare the barrier deadline (latest create time plus 'timeout', in
 * seconds) with the current wall-clock time.
 *
 * NOTE: despite the name, this returns true while the deadline is still in
 * the future, i.e. while the barrier has NOT yet expired, and false once
 * the deadline has been reached.
 */
250 bool mgs_barrier_expired(struct fs_db *fsdb, time64_t timeout)
252 time64_t expired = fsdb->fsdb_barrier_latest_create_time + timeout;
254 return expired > ktime_get_real_seconds();
258 * Create the barrier for the given instance.
260 * We use two-phases barrier to guarantee that after the barrier setup:
261 * 1) All the server side pending async modification RPCs have been flushed.
262 * 2) Any subsequent modification will be blocked.
263 * 3) All async transactions on the MDTs have been committed.
265 * For phase1, we do the following:
267 * Firstly, it sets barrier flag on the instance that will block subsequent
268 * modifications from clients. (Note: server-sponsored modifications are
269 * still allowed, so that pending modifications can be flushed.)
271 * Secondly, it will flush all pending modification via dt_sync(), such as
272 * async OST-object destroy, async OST-object owner changes, and so on.
274 * If there are some on-handling clients sponsored modifications during the
275 * barrier creating, then related modifications may cause pending requests
276 * after the first dt_sync(), so call dt_sync() again after all on-handling
277 * modifications done.
279 * With the phase1 barrier set, all pending cross-servers modification RPCs
280 * have been flushed to remote servers, and any new modification will be
281 * blocked. But it does not guarantee that all the updates have been
282 * committed to storage on remote servers. So when all the instances have
283 * done phase1 barrier successfully, the MGS will notify all instances to
284 * do the phase2 barrier as following:
286 * Every barrier instance will call dt_sync() to make all async transactions
287 * to be committed locally.
289 * \param[in] env pointer to the thread context
290 * \param[in] mgs pointer to the MGS device
291 * \param[in] bc pointer to the barrier control structure
293 * \retval 0 for success
294 * \retval negative error number on failure
/*
 * Flow, as far as visible in this function:
 *  - look up or create the barrier fsdb named "<bc_name>-<BARRIER_FILENAME>"
 *    under mgs_barrier_rwsem (write) and mgs_mutex;
 *  - take fsdb_mutex before dropping mgs_mutex, then move the barrier to
 *    BS_FREEZING_P1, recording the create time and the timeout
 *    (bc->bc_timeout, or BARRIER_TIMEOUT_DEFAULT when that is 0);
 *  - drop fsdb_mutex and the rwsem around mgs_barrier_glimpse_lock(), whose
 *    expected status is BS_FREEZING_P1 in phase 1 and BS_FROZEN otherwise,
 *    then retake both and update the status from the outcome (BS_EXPIRED,
 *    BS_FAILED, BS_FREEZING_P2 or BS_FROZEN);
 *  - a final path clears the barrier map and glimpses with BS_THAWED
 *    (its guard is not visible here; presumably taken on failure).
 */
296 static int mgs_barrier_freeze(const struct lu_env *env,
297 struct mgs_device *mgs,
298 struct barrier_ctl *bc)
/* Scratch buffer from the per-thread env info, used for the barrier fsdb name. */
300 char *name = mgs_env_info(env)->mgi_fsname;
308 snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
309 bc->bc_name, BARRIER_FILENAME);
/* Lock order: the barrier rwsem first, then the MGS mutex. */
311 down_write(&mgs->mgs_barrier_rwsem);
312 mutex_lock(&mgs->mgs_mutex);
314 rc = mgs_find_or_make_fsdb_nolock(env, mgs, name, &fsdb);
/* Both locks are released here (guard not visible; presumably the lookup-failure path). */
316 mutex_unlock(&mgs->mgs_mutex);
317 up_write(&mgs->mgs_barrier_rwsem);
/* A barrier fsdb without any MDT recorded inherits the map of the fsdb named bc->bc_name. */
321 if (unlikely(fsdb->fsdb_mdt_count == 0)) {
322 mgs_barrier_bitmap_setup(mgs, fsdb, bc->bc_name);
324 /* fsdb was just created, ensure that fsdb_barrier_disabled is
326 if (fsdb->fsdb_mdt_count > 0) {
327 struct obd_export *exp;
328 struct obd_device *mgs_obd = mgs->mgs_obd;
330 spin_lock(&mgs_obd->obd_dev_lock);
331 list_for_each_entry(exp, &mgs_obd->obd_exports,
333 __u64 flags = exp_connect_flags(exp);
334 if (!!(flags & OBD_CONNECT_MDS_MDS) &&
335 !(flags & OBD_CONNECT_BARRIER)) {
336 fsdb->fsdb_barrier_disabled = 1;
340 spin_unlock(&mgs_obd->obd_dev_lock);
344 mutex_lock(&fsdb->fsdb_mutex);
345 mutex_unlock(&mgs->mgs_mutex);
347 switch (fsdb->fsdb_barrier_status) {
357 if (mgs_barrier_expired(fsdb, fsdb->fsdb_barrier_timeout)) {
365 if (fsdb->fsdb_barrier_disabled) {
367 } else if (unlikely(fsdb->fsdb_mdt_count == 0)) {
370 fsdb->fsdb_barrier_latest_create_time =
371 ktime_get_real_seconds();
372 fsdb->fsdb_barrier_status = BS_FREEZING_P1;
373 if (bc->bc_timeout != 0)
374 fsdb->fsdb_barrier_timeout = bc->bc_timeout;
376 fsdb->fsdb_barrier_timeout =
377 BARRIER_TIMEOUT_DEFAULT;
378 memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
382 LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
383 bc->bc_name, fsdb->fsdb_barrier_status);
391 left = fsdb->fsdb_barrier_timeout;
394 mutex_unlock(&fsdb->fsdb_mutex);
395 up_write(&mgs->mgs_barrier_rwsem);
397 CFS_FAIL_TIMEOUT(OBD_FAIL_BARRIER_DELAY, cfs_fail_val);
399 rc = mgs_barrier_glimpse_lock(env, mgs, fsdb, left,
400 phase1 ? BS_FREEZING_P1 : BS_FROZEN);
401 down_write(&mgs->mgs_barrier_rwsem);
402 mutex_lock(&fsdb->fsdb_mutex);
405 left = fsdb->fsdb_barrier_latest_create_time +
406 fsdb->fsdb_barrier_timeout - ktime_get_real_seconds();
408 fsdb->fsdb_barrier_status = BS_EXPIRED;
410 GOTO(out, rc = -ETIME);
413 LASSERTF(fsdb->fsdb_barrier_status ==
414 (phase1 ? BS_FREEZING_P1 : BS_FREEZING_P2),
415 "unexpected barrier status %u\n",
416 fsdb->fsdb_barrier_status);
418 if (rc == -ETIMEDOUT) {
419 fsdb->fsdb_barrier_status = BS_EXPIRED;
422 fsdb->fsdb_barrier_status = rc;
425 fsdb->fsdb_barrier_status = BS_FAILED;
426 } else if (mgs_barrier_done(fsdb)) {
428 fsdb->fsdb_barrier_status = BS_FREEZING_P2;
429 memset(fsdb->fsdb_barrier_map, 0,
435 fsdb->fsdb_barrier_status = BS_FROZEN;
438 fsdb->fsdb_barrier_status = BS_FAILED;
445 mutex_unlock(&fsdb->fsdb_mutex);
446 up_write(&mgs->mgs_barrier_rwsem);
448 memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
449 mgs_barrier_glimpse_lock(env, mgs, fsdb, 0, BS_THAWED);
452 mgs_put_fsdb(mgs, fsdb);
/*
 * Thaw the barrier of filesystem bc->bc_name: switch the barrier fsdb to
 * BS_THAWING, glimpse the barrier locks expecting BS_THAWED, then record
 * the final status in the fsdb.
 */
457 static int mgs_barrier_thaw(const struct lu_env *env,
458 struct mgs_device *mgs,
459 struct barrier_ctl *bc)
/* Scratch buffer from the per-thread env info, used for the barrier fsdb name. */
461 char *name = mgs_env_info(env)->mgi_fsname;
466 snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
467 bc->bc_name, BARRIER_FILENAME);
/* Lock order: the barrier rwsem first, then the MGS mutex. */
469 down_write(&mgs->mgs_barrier_rwsem);
470 mutex_lock(&mgs->mgs_mutex);
472 rc = mgs_find_or_make_fsdb_nolock(env, mgs, name, &fsdb);
/* Both locks are released here (guard not visible; presumably the lookup-failure path). */
474 mutex_unlock(&mgs->mgs_mutex);
475 up_write(&mgs->mgs_barrier_rwsem);
/* A barrier fsdb without any MDT recorded inherits the map of the fsdb named bc->bc_name. */
479 if (unlikely(fsdb->fsdb_mdt_count == 0)) {
480 mgs_barrier_bitmap_setup(mgs, fsdb, bc->bc_name);
482 /* fsdb was just created, ensure that fsdb_barrier_disabled is
484 if (fsdb->fsdb_mdt_count > 0) {
485 struct obd_export *exp;
486 struct obd_device *mgs_obd = mgs->mgs_obd;
488 spin_lock(&mgs_obd->obd_dev_lock);
489 list_for_each_entry(exp, &mgs_obd->obd_exports,
491 __u64 flags = exp_connect_flags(exp);
492 if (!!(flags & OBD_CONNECT_MDS_MDS) &&
493 !(flags & OBD_CONNECT_BARRIER)) {
494 fsdb->fsdb_barrier_disabled = 1;
498 spin_unlock(&mgs_obd->obd_dev_lock);
502 mutex_lock(&fsdb->fsdb_mutex);
503 mutex_unlock(&mgs->mgs_mutex);
505 switch (fsdb->fsdb_barrier_status) {
519 case BS_EXPIRED: /* The barrier on some MDT(s) may be expired,
520 * but may be not on others. Destroy anyway. */
522 if (unlikely(fsdb->fsdb_mdt_count == 0)) {
/* Enter the thawing state with a clean barrier map. */
525 fsdb->fsdb_barrier_status = BS_THAWING;
526 memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
530 LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
531 bc->bc_name, fsdb->fsdb_barrier_status);
/* The fsdb mutex and the barrier rwsem are not held across the glimpse. */
539 mutex_unlock(&fsdb->fsdb_mutex);
540 up_write(&mgs->mgs_barrier_rwsem);
542 CFS_FAIL_TIMEOUT(OBD_FAIL_BARRIER_DELAY, cfs_fail_val);
544 rc = mgs_barrier_glimpse_lock(env, mgs, fsdb, 0, BS_THAWED);
545 down_write(&mgs->mgs_barrier_rwsem);
546 mutex_lock(&fsdb->fsdb_mutex);
/* The status is asserted unchanged even though the locks were dropped above. */
548 LASSERTF(fsdb->fsdb_barrier_status == BS_THAWING,
549 "unexpected barrier status %u\n",
550 fsdb->fsdb_barrier_status);
/* Map the glimpse outcome to the final status (the guards of the first two assignments are not visible here). */
553 fsdb->fsdb_barrier_status = rc;
556 fsdb->fsdb_barrier_status = BS_FAILED;
557 } else if (mgs_barrier_done(fsdb)) {
558 fsdb->fsdb_barrier_status = BS_THAWED;
560 fsdb->fsdb_barrier_status = BS_FAILED;
/* Common exit: release the locks and drop the fsdb reference. */
567 mutex_unlock(&fsdb->fsdb_mutex);
568 up_write(&mgs->mgs_barrier_rwsem);
569 mgs_put_fsdb(mgs, fsdb);
/*
 * Report the barrier status of filesystem bc->bc_name through
 * bc->bc_status, looking up the barrier fsdb named
 * "<bc_name>-<BARRIER_FILENAME>".
 */
574 static int mgs_barrier_stat(const struct lu_env *env,
575 struct mgs_device *mgs,
576 struct barrier_ctl *bc)
/* Scratch buffer from the per-thread env info, used for the barrier fsdb name. */
578 char *name = mgs_env_info(env)->mgi_fsname;
582 snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
583 bc->bc_name, BARRIER_FILENAME);
585 mutex_lock(&mgs->mgs_mutex);
587 fsdb = mgs_find_fsdb(mgs, name);
/* Take the fsdb mutex before dropping the MGS mutex. */
589 mutex_lock(&fsdb->fsdb_mutex);
590 mutex_unlock(&mgs->mgs_mutex);
592 bc->bc_status = fsdb->fsdb_barrier_status;
/*
 * For a freezing or frozen barrier, check the deadline. Note that
 * mgs_barrier_expired() returns true while the deadline is still in the
 * future, so the true branch computes the remaining time and the other
 * branch overwrites the status in both 'bc' and the fsdb.
 */
593 if (bc->bc_status == BS_FREEZING_P1 ||
594 bc->bc_status == BS_FREEZING_P2 ||
595 bc->bc_status == BS_FROZEN) {
596 if (mgs_barrier_expired(fsdb, fsdb->fsdb_barrier_timeout))
598 fsdb->fsdb_barrier_latest_create_time +
599 fsdb->fsdb_barrier_timeout -
600 ktime_get_real_seconds();
602 bc->bc_status = fsdb->fsdb_barrier_status =
606 mutex_unlock(&fsdb->fsdb_mutex);
607 mgs_put_fsdb(mgs, fsdb);
/* Alternative path (guard not visible here; presumably no barrier fsdb was found): report BS_INIT. */
609 mutex_unlock(&mgs->mgs_mutex);
611 bc->bc_status = BS_INIT;
/*
 * Flow, as far as visible in this function:
 *  - look up the fsdb named bc->bc_name (c_fsdb) and bail out, releasing
 *    the locks, if it is missing or has no MDT;
 *  - look up or create the barrier fsdb (b_fsdb) named
 *    "<bc_name>-<BARRIER_FILENAME>";
 *  - under both fsdb mutexes, switch b_fsdb to BS_RESCAN, copy the MDT
 *    index map and MDT count from c_fsdb and clear the barrier map;
 *  - drop b_fsdb's mutex and the rwsem around mgs_barrier_glimpse_lock()
 *    with expected status BS_INIT, then retake them;
 *  - afterwards recount the MDTs over every index set in either map, store
 *    the total in bc->bc_total, replace the MDT index map with the barrier
 *    map and return the status to BS_INIT.
 */
617 static int mgs_barrier_rescan(const struct lu_env *env,
618 struct mgs_device *mgs,
619 struct barrier_ctl *bc)
/* Scratch buffer from the per-thread env info, used for the barrier fsdb name. */
621 char *name = mgs_env_info(env)->mgi_fsname;
622 struct fs_db *b_fsdb;
623 struct fs_db *c_fsdb;
/* Lock order: the barrier rwsem first, then the MGS mutex. */
627 down_write(&mgs->mgs_barrier_rwsem);
628 mutex_lock(&mgs->mgs_mutex);
630 c_fsdb = mgs_find_fsdb(mgs, bc->bc_name);
/* Nothing to rescan without a source fsdb that knows at least one MDT. */
631 if (!c_fsdb || unlikely(c_fsdb->fsdb_mdt_count == 0)) {
632 mutex_unlock(&mgs->mgs_mutex);
633 up_write(&mgs->mgs_barrier_rwsem);
638 snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
639 bc->bc_name, BARRIER_FILENAME);
640 rc = mgs_find_or_make_fsdb_nolock(env, mgs, name, &b_fsdb);
/* Unwind, including the c_fsdb reference (guard not visible; presumably the lookup-failure path). */
642 mutex_unlock(&mgs->mgs_mutex);
643 up_write(&mgs->mgs_barrier_rwsem);
644 mgs_put_fsdb(mgs, c_fsdb);
648 if (unlikely(b_fsdb->fsdb_mdt_count == 0 &&
649 c_fsdb->fsdb_mdt_count > 0)) {
650 /* fsdb was just created, ensure that fsdb_barrier_disabled is
652 struct obd_export *exp;
653 struct obd_device *mgs_obd = mgs->mgs_obd;
655 spin_lock(&mgs_obd->obd_dev_lock);
656 list_for_each_entry(exp, &mgs_obd->obd_exports,
658 __u64 flags = exp_connect_flags(exp);
659 if (!!(flags & OBD_CONNECT_MDS_MDS) &&
660 !(flags & OBD_CONNECT_BARRIER)) {
661 b_fsdb->fsdb_barrier_disabled = 1;
665 spin_unlock(&mgs_obd->obd_dev_lock);
668 mutex_lock(&b_fsdb->fsdb_mutex);
669 mutex_lock(&c_fsdb->fsdb_mutex);
670 mutex_unlock(&mgs->mgs_mutex);
672 switch (b_fsdb->fsdb_barrier_status) {
682 if (mgs_barrier_expired(b_fsdb, b_fsdb->fsdb_barrier_timeout)) {
690 b_fsdb->fsdb_barrier_latest_create_time = ktime_get_real_seconds();
691 b_fsdb->fsdb_barrier_status = BS_RESCAN;
692 memcpy(b_fsdb->fsdb_mdt_index_map, c_fsdb->fsdb_mdt_index_map,
694 memset(b_fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
695 b_fsdb->fsdb_mdt_count = c_fsdb->fsdb_mdt_count;
698 LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
699 bc->bc_name, b_fsdb->fsdb_barrier_status);
704 mutex_unlock(&c_fsdb->fsdb_mutex);
705 mgs_put_fsdb(mgs, c_fsdb);
711 mutex_unlock(&b_fsdb->fsdb_mutex);
712 up_write(&mgs->mgs_barrier_rwsem);
713 rc = mgs_barrier_glimpse_lock(env, mgs, b_fsdb, 0, BS_INIT);
714 down_write(&mgs->mgs_barrier_rwsem);
715 mutex_lock(&b_fsdb->fsdb_mutex);
717 LASSERTF(b_fsdb->fsdb_barrier_status == BS_RESCAN,
718 "unexpected barrier status %u\n",
719 b_fsdb->fsdb_barrier_status);
722 b_fsdb->fsdb_barrier_status = rc;
724 } else if (rc == -ETIMEDOUT &&
725 mgs_barrier_expired(b_fsdb, bc->bc_timeout)) {
726 memset(b_fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
729 } else if (rc < 0 && rc != -ETIMEDOUT && rc != -ENODEV) {
730 b_fsdb->fsdb_barrier_status = BS_FAILED;
734 b_fsdb->fsdb_mdt_count = 0;
738 for (i = 0; i < INDEX_MAP_SIZE * 8; i++) {
739 if (test_bit(i, b_fsdb->fsdb_barrier_map)) {
740 b_fsdb->fsdb_mdt_count++;
741 } else if (test_bit(i, b_fsdb->fsdb_mdt_index_map)) {
742 b_fsdb->fsdb_mdt_count++;
747 bc->bc_total = b_fsdb->fsdb_mdt_count;
748 memcpy(b_fsdb->fsdb_mdt_index_map,
749 b_fsdb->fsdb_barrier_map, INDEX_MAP_SIZE);
750 b_fsdb->fsdb_barrier_status = BS_INIT;
756 mutex_unlock(&b_fsdb->fsdb_mutex);
757 up_write(&mgs->mgs_barrier_rwsem);
758 mgs_put_fsdb(mgs, b_fsdb);
/*
 * ioctl entry point for barrier operations: validates the barrier control
 * structure carried in data->ioc_inlbuf1 and dispatches on bc->bc_cmd to
 * the freeze, thaw, stat and rescan handlers.
 */
763 int mgs_iocontrol_barrier(const struct lu_env *env,
764 struct mgs_device *mgs,
765 struct obd_ioctl_data *data)
767 struct barrier_ctl *bc = (struct barrier_ctl *)(data->ioc_inlbuf1);
/* Only the V1 barrier control layout is accepted. */
771 if (unlikely(bc->bc_version != BARRIER_VERSION_V1))
/* The name must be non-empty and no longer than 8 characters. */
774 if (unlikely(bc->bc_name[0] == '\0' ||
775 strnlen(bc->bc_name, sizeof(bc->bc_name)) > 8))
778 /* NOT allow barrier operations during recovery. */
779 if (unlikely(mgs->mgs_obd->obd_recovering))
/* Each handler returns its result in rc (the case labels are not visible here). */
782 switch (bc->bc_cmd) {
784 rc = mgs_barrier_freeze(env, mgs, bc);
787 rc = mgs_barrier_thaw(env, mgs, bc);
790 rc = mgs_barrier_stat(env, mgs, bc);
793 rc = mgs_barrier_rescan(env, mgs, bc);