4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License version 2 for more details. A copy is
14 * included in the COPYING file that accompanied this code.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 * Copyright (c) 2016, Intel Corporation.
25 * lustre/mgs/mgs_barrier.c
27 * Author: Fan, Yong <fan.yong@intel.com>
30 #define DEBUG_SUBSYSTEM S_MGS
31 #define D_MGS D_CONFIG
33 #include <lustre_ioctl.h>
34 #include <lustre_swab.h>
35 #include <lustre/lustre_barrier_user.h>
37 #include "mgs_internal.h"
40 * Handle the barrier lock glimpse reply.
42 * The barrier lock glimpse reply contains the target MDT's index and
43 * the barrier operation status on that MDT. With this information, if
44 * the status reported by the MDT is the expected one, then set the related
45 * bit in the 'fsdb' barrier bitmap; otherwise record the failure or status.
47 * \param[in] env pointer to the thread context
48 * \param[in] req pointer to the glimpse callback RPC request
49 * \param[in] data pointer to the async glimpse callback data
50 * \param[in] rc the glimpse callback RPC return value
52 * \retval 0 for success
53 * \retval negative error number on failure
/* Glimpse reply callback: fold one lock holder's reported barrier status
 * into the barrier 'fsdb' (bitmap bit on match, result field otherwise). */
55 static int mgs_barrier_gl_interpret_reply(const struct lu_env *env,
56 struct ptlrpc_request *req,
59 struct ldlm_cb_async_args *ca = data;
/* NOTE(review): presumably the same 'fsdb' pointer that
 * mgs_barrier_glimpse_lock() stores in each work item's
 * gl_interpret_data -- confirm how ca_set_arg is populated. */
60 struct fs_db *fsdb = ca->ca_set_arg->gl_interpret_data;
61 struct barrier_lvb *lvb;
66 /* The lock is useless, cancel it. */
67 ldlm_lock_cancel(ca->ca_lock);
/* Fetch the barrier LVB from the reply capsule; lustre_swab_barrier_lvb
 * is handed over as the swab routine for that buffer. */
72 lvb = req_capsule_server_swab_get(&req->rq_pill, &RMF_DLM_LVB,
73 lustre_swab_barrier_lvb);
/* Leave through the 'out' label with rc = -EPROTO. */
75 GOTO(out, rc = -EPROTO);
/* Reported status equals the expected one: mark this MDT's index in the
 * barrier bitmap.
 * NOTE(review): the range check compares lvb_index against INDEX_MAP_SIZE,
 * whereas mgs_barrier_done() scans INDEX_MAP_SIZE * 8 bits, and the else-if
 * branch below calls test_bit() on lvb_index with no range check at all --
 * confirm the intended bound. */
77 if (lvb->lvb_status == fsdb->fsdb_barrier_expected) {
78 if (unlikely(lvb->lvb_index > INDEX_MAP_SIZE))
81 set_bit(lvb->lvb_index, fsdb->fsdb_barrier_map);
/* Unexpected status from an MDT whose bit is not set yet: remember the
 * reported status as the overall barrier result. */
82 } else if (likely(!test_bit(lvb->lvb_index, fsdb->fsdb_barrier_map))) {
83 fsdb->fsdb_barrier_result = lvb->lvb_status;
/* Publish rc to the sender through fsdb_barrier_result, which
 * mgs_barrier_glimpse_lock() reads back after the glimpse. */
90 fsdb->fsdb_barrier_result = rc;
96 * Send glimpse callback to the barrier locks holders.
98 * The glimpse callback takes the current barrier status. The barrier locks
99 * holders (on the MDTs) will take related barrier actions according to the
100 * given barrier status, then return their local barrier status.
102 * \param[in] env pointer to the thread context
103 * \param[in] mgs pointer to the MGS device
104 * \param[in] fsdb pointer to the barrier 'fsdb'
105 * \param[in] timeout indicate when the barrier will be expired
106 * \param[in] expected the expected barrier status on remote servers (MDTs)
108 * \retval positive number for unexpected barrier status
109 * \retval 0 for success
110 * \retval negative error number on failure
/* Build a list of glimpse work items over the granted locks of the barrier
 * resource, send them with ldlm_glimpse_locks(), and pick the outcome up
 * from fsdb_barrier_result. */
112 static int mgs_barrier_glimpse_lock(const struct lu_env *env,
113 struct mgs_device *mgs,
115 __u32 timeout, __u32 expected)
/* The glimpse descriptor lives in the per-env MGS info, not on the stack. */
117 union ldlm_gl_desc *desc = &mgs_env_info(env)->mgi_gl_desc;
118 struct ldlm_res_id res_id;
119 struct ldlm_resource *res;
120 struct ldlm_glimpse_work *work;
121 struct ldlm_glimpse_work *tmp;
122 struct list_head gl_list = LIST_HEAD_INIT(gl_list);
123 struct list_head *pos;
/* Callers must have populated the MDT count before glimpsing. */
128 LASSERT(fsdb->fsdb_mdt_count > 0);
/* Derive the barrier resource id from the fsdb name (CONFIG_T_BARRIER). */
130 rc = mgc_logname2resid(fsdb->fsdb_name, &res_id, CONFIG_T_BARRIER);
/* Look the resource up in the MGS obd namespace; a lookup failure is
 * returned as PTR_ERR(res). */
134 res = ldlm_resource_get(mgs->mgs_obd->obd_namespace, NULL, &res_id,
137 RETURN(PTR_ERR(res));
/* Reset the shared result, record the status the reply callback should
 * compare against, and fill the descriptor sent to the lock holders with
 * the current barrier status and the timeout. */
139 fsdb->fsdb_barrier_result = 0;
140 fsdb->fsdb_barrier_expected = expected;
141 desc->barrier_desc.lgbd_status = fsdb->fsdb_barrier_status;
142 desc->barrier_desc.lgbd_timeout = timeout;
/* Drop the lock reference held by every work item already on the list and
 * clear the pointer so it is not released twice. */
145 list_for_each_entry(work, &gl_list, gl_list) {
149 LDLM_LOCK_RELEASE(work->gl_lock);
150 work->gl_lock = NULL;
153 /* It is not big issue to alloc more work item than needed. */
/* One pass per MDT; each pass appends a work item to gl_list or leaves
 * with -ENOMEM. */
154 for (i = 0; i < fsdb->fsdb_mdt_count; i++) {
157 GOTO(out, rc = -ENOMEM);
159 list_add_tail(&work->gl_list, &gl_list);
/* Start filling from the first work item on the list. */
162 work = list_entry(gl_list.next, struct ldlm_glimpse_work, gl_list);
/* Bind each granted lock of the resource to the current work item. */
165 list_for_each(pos, &res->lr_granted) {
166 struct ldlm_lock *lock = list_entry(pos, struct ldlm_lock,
/* Take a lock reference for the work item and attach the descriptor, the
 * reply callback and the 'fsdb' the callback will update. */
169 work->gl_lock = LDLM_LOCK_GET(lock);
171 work->gl_desc = desc;
172 work->gl_interpret_reply = mgs_barrier_gl_interpret_reply;
173 work->gl_interpret_data = fsdb;
/* Reached the last work item: that is the expected end only if this was
 * also the last granted lock. */
175 if (unlikely(work->gl_list.next == &gl_list)) {
176 if (likely(pos->next == &res->lr_granted))
180 /* The granted locks are more than the MDTs count. */
/* Advance to the next work item on the list. */
184 work = list_entry(work->gl_list.next, struct ldlm_glimpse_work,
189 /* The MDTs count may be more than the granted locks. */
/* Walk the work list from the tail, unlinking entries. */
190 list_for_each_entry_safe_reverse(work, tmp, &gl_list, gl_list) {
194 list_del(&work->gl_list);
/* Only issue the glimpse when at least one work item remains. */
198 if (!list_empty(&gl_list))
199 rc = ldlm_glimpse_locks(res, &gl_list);
/* Tear down whatever is still on the list, releasing lock references. */
206 list_for_each_entry_safe(work, tmp, &gl_list, gl_list) {
207 list_del(&work->gl_list);
209 LDLM_LOCK_RELEASE(work->gl_lock);
213 ldlm_resource_putref(res);
/* Take over the result recorded by the reply callbacks. */
215 rc = fsdb->fsdb_barrier_result;
/* Seed the barrier fsdb's MDT index map and MDT count from the fsdb found
 * under 'name', if such an fsdb exists. */
220 static void mgs_barrier_bitmap_setup(struct mgs_device *mgs,
221 struct fs_db *b_fsdb,
224 struct fs_db *c_fsdb;
/* Look up the source fsdb; the reference obtained here is dropped below
 * with mgs_put_fsdb(). */
226 c_fsdb = mgs_find_fsdb(mgs, name);
227 if (likely(c_fsdb)) {
/* Copy the whole INDEX_MAP_SIZE-byte MDT bitmap together with the count. */
228 memcpy(b_fsdb->fsdb_mdt_index_map,
229 c_fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE);
230 b_fsdb->fsdb_mdt_count = c_fsdb->fsdb_mdt_count;
231 mgs_put_fsdb(mgs, c_fsdb);
/* Compare the MDT index map against the barrier map bit by bit. */
235 static bool mgs_barrier_done(struct fs_db *fsdb)
/* The maps are INDEX_MAP_SIZE bytes, hence INDEX_MAP_SIZE * 8 bits. */
239 for (i = 0; i < INDEX_MAP_SIZE * 8; i++) {
/* Detect an MDT that is known (index map) but has not been marked in the
 * barrier map; presumably this means the barrier is not done -- confirm
 * against the statement guarded by this test. */
240 if (test_bit(i, fsdb->fsdb_mdt_index_map) &&
241 !test_bit(i, fsdb->fsdb_barrier_map))
249 * Create the barrier for the given instance.
251 * We use a two-phase barrier to guarantee that after the barrier setup:
252 * 1) All the server side pending async modification RPCs have been flushed.
253 * 2) Any subsequent modification will be blocked.
254 * 3) All async transactions on the MDTs have been committed.
256 * For phase1, we do the following:
258 * Firstly, it sets barrier flag on the instance that will block subsequent
259 * modifications from clients. (Note: server sponsored modification will be
260 * allowed for flush pending modifications)
262 * Secondly, it will flush all pending modification via dt_sync(), such as
263 * async OST-object destroy, async OST-object owner changes, and so on.
265 * If some client-sponsored modifications are still being handled while the
266 * barrier is being created, they may generate new pending requests after
267 * the first dt_sync(), so dt_sync() is called again once all in-flight
268 * modifications are done.
270 * With the phase1 barrier set, all pending cross-server modification RPCs
271 * have been flushed to remote servers, and any new modification will be
272 * blocked. But this does not guarantee that all the updates have been
273 * committed to storage on the remote servers. So when all the instances
274 * have completed the phase1 barrier successfully, the MGS will notify all
275 * instances to do the phase2 barrier as follows:
277 * Every barrier instance will call dt_sync() to make all async transactions
278 * to be committed locally.
280 * \param[in] env pointer to the thread context
281 * \param[in] mgs pointer to the MGS device
282 * \param[in] bc pointer to the barrier control structure
284 * \retval 0 for success
285 * \retval negative error number on failure
/* Drive the freeze state machine of the barrier fsdb named after
 * bc->bc_name, glimpsing the lock holders with the locks dropped. */
287 static int mgs_barrier_freeze(const struct lu_env *env,
288 struct mgs_device *mgs,
289 struct barrier_ctl *bc)
/* The fsdb name is formatted into the per-env mgi_fsname buffer. */
291 char *name = mgs_env_info(env)->mgi_fsname;
/* Barrier fsdb name: "<bc_name>-<BARRIER_FILENAME>". */
299 snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
300 bc->bc_name, BARRIER_FILENAME);
/* Lock order: barrier rwsem (write) first, then the MGS mutex. */
302 down_write(&mgs->mgs_barrier_rwsem);
303 mutex_lock(&mgs->mgs_mutex);
305 fsdb = mgs_find_fsdb(mgs, name);
/* Release both locks in reverse order of acquisition. */
307 mutex_unlock(&mgs->mgs_mutex);
308 up_write(&mgs->mgs_barrier_rwsem);
/* No MDT count recorded yet: import the MDT bitmap and count from the
 * fsdb named bc->bc_name. */
313 if (unlikely(fsdb->fsdb_mdt_count == 0))
314 mgs_barrier_bitmap_setup(mgs, fsdb, bc->bc_name);
/* Hand over from the MGS-wide mutex to the per-fsdb mutex. */
316 mutex_lock(&fsdb->fsdb_mutex);
317 mutex_unlock(&mgs->mgs_mutex);
/* Decide what to do from the current barrier status. */
319 switch (fsdb->fsdb_barrier_status) {
/* True while "now" is still before latest create time + timeout. */
329 if (cfs_time_before(cfs_time_current_sec(),
330 fsdb->fsdb_barrier_latest_create_time +
331 fsdb->fsdb_barrier_timeout)) {
339 if (fsdb->fsdb_barrier_disabled) {
341 } else if (unlikely(fsdb->fsdb_mdt_count == 0)) {
/* Start a new freeze: stamp the creation time and enter phase 1. */
344 fsdb->fsdb_barrier_latest_create_time =
345 cfs_time_current_sec();
346 fsdb->fsdb_barrier_status = BS_FREEZING_P1;
/* A non-zero caller timeout wins; BARRIER_TIMEOUT_DEFAULT is the other
 * value assigned here. */
347 if (bc->bc_timeout != 0)
348 fsdb->fsdb_barrier_timeout = bc->bc_timeout;
350 fsdb->fsdb_barrier_timeout =
351 BARRIER_TIMEOUT_DEFAULT;
/* Forget any per-MDT progress from an earlier barrier operation. */
352 memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
356 LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
357 bc->bc_name, fsdb->fsdb_barrier_status);
/* Remaining time starts out as the full timeout. */
365 left = fsdb->fsdb_barrier_timeout;
/* Both locks are dropped around the glimpse and retaken afterwards. */
368 mutex_unlock(&fsdb->fsdb_mutex);
369 up_write(&mgs->mgs_barrier_rwsem);
/* NOTE(review): looks like a fault-injection delay hook keyed by
 * OBD_FAIL_BARRIER_DELAY -- confirm. */
371 CFS_FAIL_TIMEOUT(OBD_FAIL_BARRIER_DELAY, cfs_fail_val);
/* Expected remote status: BS_FREEZING_P1 in phase 1, BS_FROZEN otherwise. */
373 rc = mgs_barrier_glimpse_lock(env, mgs, fsdb, left,
374 phase1 ? BS_FREEZING_P1 : BS_FROZEN);
375 down_write(&mgs->mgs_barrier_rwsem);
376 mutex_lock(&fsdb->fsdb_mutex);
/* Recompute the time left until create time + timeout. */
379 left = fsdb->fsdb_barrier_latest_create_time +
380 fsdb->fsdb_barrier_timeout - cfs_time_current_sec();
/* Expiry path: mark the barrier BS_EXPIRED and leave with -ETIME. */
382 fsdb->fsdb_barrier_status = BS_EXPIRED;
384 GOTO(out, rc = -ETIME);
/* The status must still be the phase set before the locks were dropped. */
387 LASSERTF(fsdb->fsdb_barrier_status ==
388 (phase1 ? BS_FREEZING_P1 : BS_FREEZING_P2),
389 "unexpected barrier status %u\n",
390 fsdb->fsdb_barrier_status);
/* Map the glimpse outcome onto the barrier status: a glimpse timeout is
 * recorded as BS_EXPIRED; the other visible outcomes are rc itself,
 * BS_FAILED, BS_FREEZING_P2 and BS_FROZEN. */
392 if (rc == -ETIMEDOUT) {
393 fsdb->fsdb_barrier_status = BS_EXPIRED;
396 fsdb->fsdb_barrier_status = rc;
399 fsdb->fsdb_barrier_status = BS_FAILED;
400 } else if (mgs_barrier_done(fsdb)) {
402 fsdb->fsdb_barrier_status = BS_FREEZING_P2;
403 memset(fsdb->fsdb_barrier_map, 0,
409 fsdb->fsdb_barrier_status = BS_FROZEN;
412 fsdb->fsdb_barrier_status = BS_FAILED;
419 mutex_unlock(&fsdb->fsdb_mutex);
420 up_write(&mgs->mgs_barrier_rwsem);
/* Clear the barrier map and glimpse again expecting BS_THAWED with a zero
 * timeout; the result of this glimpse is not checked.
 * NOTE(review): this memset runs after fsdb_mutex has been released --
 * confirm that no concurrent barrier operation can touch the map here. */
422 memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
423 mgs_barrier_glimpse_lock(env, mgs, fsdb, 0, BS_THAWED);
/* Drop the reference taken by mgs_find_fsdb(). */
426 mgs_put_fsdb(mgs, fsdb);
/* Thaw the barrier fsdb named after bc->bc_name: move it to BS_THAWING,
 * glimpse the lock holders expecting BS_THAWED, then record the outcome. */
431 static int mgs_barrier_thaw(const struct lu_env *env,
432 struct mgs_device *mgs,
433 struct barrier_ctl *bc)
/* The fsdb name is formatted into the per-env mgi_fsname buffer. */
435 char *name = mgs_env_info(env)->mgi_fsname;
/* Barrier fsdb name: "<bc_name>-<BARRIER_FILENAME>". */
440 snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
441 bc->bc_name, BARRIER_FILENAME);
/* Lock order: barrier rwsem (write) first, then the MGS mutex. */
443 down_write(&mgs->mgs_barrier_rwsem);
444 mutex_lock(&mgs->mgs_mutex);
446 fsdb = mgs_find_fsdb(mgs, name);
/* Release both locks in reverse order of acquisition. */
448 mutex_unlock(&mgs->mgs_mutex);
449 up_write(&mgs->mgs_barrier_rwsem);
/* No MDT count recorded yet: import the MDT bitmap and count from the
 * fsdb named bc->bc_name. */
454 if (unlikely(fsdb->fsdb_mdt_count == 0))
455 mgs_barrier_bitmap_setup(mgs, fsdb, bc->bc_name);
/* Hand over from the MGS-wide mutex to the per-fsdb mutex. */
457 mutex_lock(&fsdb->fsdb_mutex);
458 mutex_unlock(&mgs->mgs_mutex);
/* Decide what to do from the current barrier status. */
460 switch (fsdb->fsdb_barrier_status) {
474 case BS_EXPIRED: /* The barrier on some MDT(s) may be expired,
475 * but may not be on others. Destroy anyway. */
477 if (unlikely(fsdb->fsdb_mdt_count == 0)) {
/* Enter BS_THAWING and forget per-MDT progress of earlier operations. */
480 fsdb->fsdb_barrier_status = BS_THAWING;
481 memset(fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
485 LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
486 bc->bc_name, fsdb->fsdb_barrier_status);
/* Both locks are dropped around the glimpse and retaken afterwards. */
494 mutex_unlock(&fsdb->fsdb_mutex);
495 up_write(&mgs->mgs_barrier_rwsem);
/* NOTE(review): looks like a fault-injection delay hook keyed by
 * OBD_FAIL_BARRIER_DELAY -- confirm. */
497 CFS_FAIL_TIMEOUT(OBD_FAIL_BARRIER_DELAY, cfs_fail_val);
/* Glimpse with a zero timeout; BS_THAWED is the expected remote status. */
499 rc = mgs_barrier_glimpse_lock(env, mgs, fsdb, 0, BS_THAWED);
500 down_write(&mgs->mgs_barrier_rwsem);
501 mutex_lock(&fsdb->fsdb_mutex);
/* The status must still be BS_THAWING after the locks were retaken. */
503 LASSERTF(fsdb->fsdb_barrier_status == BS_THAWING,
504 "unexpected barrier status %u\n",
505 fsdb->fsdb_barrier_status);
/* Map the glimpse outcome onto the barrier status; BS_THAWED is set only
 * when mgs_barrier_done() reports true. */
508 fsdb->fsdb_barrier_status = rc;
511 fsdb->fsdb_barrier_status = BS_FAILED;
512 } else if (mgs_barrier_done(fsdb)) {
513 fsdb->fsdb_barrier_status = BS_THAWED;
515 fsdb->fsdb_barrier_status = BS_FAILED;
/* Unlock and drop the reference taken by mgs_find_fsdb(). */
522 mutex_unlock(&fsdb->fsdb_mutex);
523 up_write(&mgs->mgs_barrier_rwsem);
524 mgs_put_fsdb(mgs, fsdb);
/* Report the barrier status of the barrier fsdb named after bc->bc_name
 * into 'bc', taking the barrier lifetime into account. */
529 static int mgs_barrier_stat(const struct lu_env *env,
530 struct mgs_device *mgs,
531 struct barrier_ctl *bc)
/* The fsdb name is formatted into the per-env mgi_fsname buffer. */
533 char *name = mgs_env_info(env)->mgi_fsname;
/* Barrier fsdb name: "<bc_name>-<BARRIER_FILENAME>". */
537 snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
538 bc->bc_name, BARRIER_FILENAME);
/* Only the MGS mutex is taken here; the barrier rwsem is not used. */
540 mutex_lock(&mgs->mgs_mutex);
542 fsdb = mgs_find_fsdb(mgs, name);
/* Hand over from the MGS-wide mutex to the per-fsdb mutex. */
544 mutex_lock(&fsdb->fsdb_mutex);
545 mutex_unlock(&mgs->mgs_mutex);
/* Start from the stored status. */
547 bc->bc_status = fsdb->fsdb_barrier_status;
/* For the freezing/frozen states the barrier lifetime matters. */
548 if (bc->bc_status == BS_FREEZING_P1 ||
549 bc->bc_status == BS_FREEZING_P2 ||
550 bc->bc_status == BS_FROZEN) {
/* Still before create time + timeout: the expression that follows is the
 * number of seconds left until that deadline. */
551 if (cfs_time_before(cfs_time_current_sec(),
552 fsdb->fsdb_barrier_latest_create_time +
553 fsdb->fsdb_barrier_timeout))
555 fsdb->fsdb_barrier_latest_create_time +
556 fsdb->fsdb_barrier_timeout -
557 cfs_time_current_sec();
/* Otherwise the reported status and the stored status are both updated
 * to the same new value. */
559 bc->bc_status = fsdb->fsdb_barrier_status =
563 mutex_unlock(&fsdb->fsdb_mutex);
564 mgs_put_fsdb(mgs, fsdb);
/* This path releases the MGS mutex without touching any fsdb and reports
 * BS_INIT (presumably when no barrier fsdb was found -- confirm). */
566 mutex_unlock(&mgs->mgs_mutex);
568 bc->bc_status = BS_INIT;
/* Rescan the MDTs behind a barrier: reload the MDT bitmap from the config
 * fsdb, glimpse the lock holders expecting BS_INIT, then rebuild the
 * barrier fsdb's MDT map and count from the replies. */
574 static int mgs_barrier_rescan(const struct lu_env *env,
575 struct mgs_device *mgs,
576 struct barrier_ctl *bc)
/* The barrier fsdb name is formatted into the per-env mgi_fsname buffer. */
578 char *name = mgs_env_info(env)->mgi_fsname;
/* b_fsdb: barrier fsdb ("<bc_name>-<BARRIER_FILENAME>");
 * c_fsdb: fsdb looked up under bc->bc_name itself. */
579 struct fs_db *b_fsdb;
580 struct fs_db *c_fsdb;
/* Lock order: barrier rwsem (write) first, then the MGS mutex. */
584 down_write(&mgs->mgs_barrier_rwsem);
585 mutex_lock(&mgs->mgs_mutex);
/* The config fsdb must exist and know at least one MDT; otherwise both
 * locks are released. */
587 c_fsdb = mgs_find_fsdb(mgs, bc->bc_name);
588 if (!c_fsdb || unlikely(c_fsdb->fsdb_mdt_count == 0)) {
589 mutex_unlock(&mgs->mgs_mutex);
590 up_write(&mgs->mgs_barrier_rwsem);
/* Build the barrier fsdb name and look that fsdb up as well. */
595 snprintf(name, sizeof(mgs_env_info(env)->mgi_fsname) - 1, "%s-%s",
596 bc->bc_name, BARRIER_FILENAME);
597 b_fsdb = mgs_find_fsdb(mgs, name);
/* Release both locks and the config fsdb reference. */
599 mutex_unlock(&mgs->mgs_mutex);
600 up_write(&mgs->mgs_barrier_rwsem);
601 mgs_put_fsdb(mgs, c_fsdb);
/* Take the barrier fsdb mutex first, then the config fsdb mutex, and only
 * then let go of the MGS-wide mutex. */
606 mutex_lock(&b_fsdb->fsdb_mutex);
607 mutex_lock(&c_fsdb->fsdb_mutex);
608 mutex_unlock(&mgs->mgs_mutex);
/* Decide what to do from the current barrier status. */
610 switch (b_fsdb->fsdb_barrier_status) {
/* True while "now" is still before latest create time + timeout. */
620 if (cfs_time_before(cfs_time_current_sec(),
621 b_fsdb->fsdb_barrier_latest_create_time +
622 b_fsdb->fsdb_barrier_timeout)) {
/* Start the rescan: stamp the time, enter BS_RESCAN, take over the config
 * fsdb's MDT bitmap and count, and clear the barrier map. */
630 b_fsdb->fsdb_barrier_latest_create_time =
631 cfs_time_current_sec();
632 b_fsdb->fsdb_barrier_status = BS_RESCAN;
633 memcpy(b_fsdb->fsdb_mdt_index_map, c_fsdb->fsdb_mdt_index_map,
635 memset(b_fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
636 b_fsdb->fsdb_mdt_count = c_fsdb->fsdb_mdt_count;
639 LCONSOLE_WARN("%s: found unexpected barrier status %u\n",
640 bc->bc_name, b_fsdb->fsdb_barrier_status);
/* The config fsdb is no longer needed: unlock it and drop the reference. */
645 mutex_unlock(&c_fsdb->fsdb_mutex);
646 mgs_put_fsdb(mgs, c_fsdb);
/* Both remaining locks are dropped around the glimpse (zero timeout,
 * expected remote status BS_INIT) and retaken afterwards. */
652 mutex_unlock(&b_fsdb->fsdb_mutex);
653 up_write(&mgs->mgs_barrier_rwsem);
654 rc = mgs_barrier_glimpse_lock(env, mgs, b_fsdb, 0, BS_INIT);
655 down_write(&mgs->mgs_barrier_rwsem);
656 mutex_lock(&b_fsdb->fsdb_mutex);
/* The status must still be BS_RESCAN after the locks were retaken. */
658 LASSERTF(b_fsdb->fsdb_barrier_status == BS_RESCAN,
659 "unexpected barrier status %u\n",
660 b_fsdb->fsdb_barrier_status);
/* Map the glimpse outcome onto the barrier status. A glimpse timeout while
 * the deadline has not passed only clears the barrier map (presumably so
 * the glimpse can be retried -- confirm); negative errors other than
 * -ETIMEDOUT and -ENODEV mark the barrier BS_FAILED. */
663 b_fsdb->fsdb_barrier_status = rc;
665 } else if (rc == -ETIMEDOUT &&
666 cfs_time_before(cfs_time_current_sec(),
667 b_fsdb->fsdb_barrier_latest_create_time +
669 memset(b_fsdb->fsdb_barrier_map, 0, INDEX_MAP_SIZE);
672 } else if (rc < 0 && rc != -ETIMEDOUT && rc != -ENODEV) {
673 b_fsdb->fsdb_barrier_status = BS_FAILED;
/* Recount the MDTs: an index is counted when its bit is set in the barrier
 * map or, failing that, in the MDT index map. */
677 b_fsdb->fsdb_mdt_count = 0;
681 for (i = 0; i < INDEX_MAP_SIZE * 8; i++) {
682 if (test_bit(i, b_fsdb->fsdb_barrier_map)) {
683 b_fsdb->fsdb_mdt_count++;
684 } else if (test_bit(i, b_fsdb->fsdb_mdt_index_map)) {
685 b_fsdb->fsdb_mdt_count++;
/* Report the total, adopt the barrier map as the new MDT index map, and
 * return the barrier to BS_INIT. */
690 bc->bc_total = b_fsdb->fsdb_mdt_count;
691 memcpy(b_fsdb->fsdb_mdt_index_map,
692 b_fsdb->fsdb_barrier_map, INDEX_MAP_SIZE);
693 b_fsdb->fsdb_barrier_status = BS_INIT;
/* Unlock and drop the barrier fsdb reference. */
699 mutex_unlock(&b_fsdb->fsdb_mutex);
700 up_write(&mgs->mgs_barrier_rwsem);
701 mgs_put_fsdb(mgs, b_fsdb);
/* ioctl entry point for barrier commands: check the request found in the
 * first inline ioctl buffer and dispatch on bc_cmd. */
706 int mgs_iocontrol_barrier(const struct lu_env *env,
707 struct mgs_device *mgs,
708 struct obd_ioctl_data *data)
/* The barrier control structure is read directly from ioc_inlbuf1. */
710 struct barrier_ctl *bc = (struct barrier_ctl *)(data->ioc_inlbuf1);
/* Only BARRIER_VERSION_V1 requests pass this check. */
714 if (unlikely(bc->bc_version != BARRIER_VERSION_V1))
/* Names longer than 8 characters trip this check; strnlen() is bounded by
 * the size of bc_name, so an unterminated name cannot be overrun. */
717 if (unlikely(strnlen(bc->bc_name, sizeof(bc->bc_name)) > 8))
/* Dispatch to the freeze / thaw / stat / rescan handler. */
720 switch (bc->bc_cmd) {
722 rc = mgs_barrier_freeze(env, mgs, bc);
725 rc = mgs_barrier_thaw(env, mgs, bc);
728 rc = mgs_barrier_stat(env, mgs, bc);
731 rc = mgs_barrier_rescan(env, mgs, bc);