/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License version 2 for more details. A copy is
 * included in the COPYING file that accompanied this code.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
/*
 * Copyright (c) 2017, Intel Corporation.
 *
 * lustre/target/barrier.c
 *
 * Currently, the Lustre barrier is implemented as a write barrier on all MDTs.
 * Each MDT in the system registers a barrier instance when it starts; that
 * instance is used to handle subsequent barrier requests.
 *
 * Author: Fan, Yong <fan.yong@intel.com>
 */
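/*
 * Rough usage sketch (illustrative only; the real callers live in the MDT
 * code and the names below are hypothetical):
 *
 *	rc = barrier_register(bottom_dt, next_dt);	// at MDT setup
 *
 *	if (!barrier_entry(bottom_dt))			// before a modification
 *		return -EINPROGRESS;			// or whatever the caller uses
 *	rc = apply_modification(env, obj);		// hypothetical helper
 *	barrier_exit(bottom_dt);			// after the modification
 *
 *	barrier_deregister(bottom_dt);			// at MDT cleanup
 */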
#define DEBUG_SUBSYSTEM S_SNAPSHOT

#include <linux/percpu_counter.h>

#include <dt_object.h>
#include <obd_class.h>
#include <lustre_barrier.h>
#include <uapi/linux/lustre/lustre_barrier_user.h>
static LIST_HEAD(barrier_instance_list);
static DEFINE_SPINLOCK(barrier_instance_lock);
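/*
 * Per-MDT barrier state.  Every registered instance is linked on
 * barrier_instance_list (protected by barrier_instance_lock) and is
 * looked up by its bottom dt_device.
 */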
struct barrier_instance {
	struct list_head	 bi_link;
	struct dt_device	*bi_bottom;
	struct dt_device	*bi_next;
	wait_queue_head_t	 bi_waitq;
	rwlock_t		 bi_rwlock;
	struct percpu_counter	 bi_writers;
	atomic_t		 bi_ref;
	time64_t		 bi_deadline;
	__u32			 bi_status;
};
static inline char *barrier_barrier2name(struct barrier_instance *barrier)
{
	return barrier->bi_bottom->dd_lu_dev.ld_obd->obd_name;
}
static inline __u32 barrier_dev_idx(struct barrier_instance *barrier)
{
	return lu_site2seq(barrier->bi_bottom->dd_lu_dev.ld_site)->ss_node_id;
}
static void barrier_instance_cleanup(struct barrier_instance *barrier)
{
	LASSERT(list_empty(&barrier->bi_link));

	percpu_counter_destroy(&barrier->bi_writers);
	OBD_FREE_PTR(barrier);
}
static inline void barrier_instance_put(struct barrier_instance *barrier)
{
	if (atomic_dec_and_test(&barrier->bi_ref))
		barrier_instance_cleanup(barrier);
}
static struct barrier_instance *
barrier_instance_find_locked(struct dt_device *key)
{
	struct barrier_instance *barrier;

	list_for_each_entry(barrier, &barrier_instance_list, bi_link) {
		if (barrier->bi_bottom == key)
			return barrier;
	}

	return NULL;
}
static void barrier_instance_add(struct barrier_instance *barrier)
{
	struct barrier_instance *tmp;

	spin_lock(&barrier_instance_lock);
	tmp = barrier_instance_find_locked(barrier->bi_bottom);
	LASSERT(!tmp);

	list_add_tail(&barrier->bi_link, &barrier_instance_list);
	spin_unlock(&barrier_instance_lock);
}
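/* Find the instance for @key and take a reference on it; the caller must
 * drop the reference with barrier_instance_put(). */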
static struct barrier_instance *barrier_instance_find(struct dt_device *key)
{
	struct barrier_instance *barrier;

	spin_lock(&barrier_instance_lock);
	barrier = barrier_instance_find_locked(key);
	if (barrier)
		atomic_inc(&barrier->bi_ref);
	spin_unlock(&barrier_instance_lock);

	return barrier;
}
static void barrier_set(struct barrier_instance *barrier, __u32 status)
{
	if (barrier->bi_status != status) {
		CDEBUG(D_SNAPSHOT, "%s: change barrier status from %u to %u\n",
		       barrier_barrier2name(barrier),
		       barrier->bi_status, status);

		barrier->bi_status = status;
	}
}
/**
 * Create the barrier for the given instance.
 *
 * We use a two-phase barrier to guarantee that after the barrier setup:
 * 1) All pending asynchronous modifications on the MDT side have been flushed.
 * 2) Any subsequent modification will be blocked.
 * 3) All asynchronous transactions on the MDTs have been committed.
 *
 * For phase1, we do the following:
 *
 * First, set the barrier flag on the instance, which blocks subsequent
 * modifications from clients. (Note: server-sponsored modifications are
 * still allowed so that pending modifications can be flushed.)
 *
 * Second, flush all pending modifications via dt_sync(), such as async
 * OST-object destroys, async OST-object owner changes, and so on.
 *
 * If some client-sponsored modifications are still being handled while the
 * barrier is freezing, they may generate further pending requests after the
 * first dt_sync(), so dt_sync() is called again once all in-flight
 * modifications are done.
 *
 * With the phase1 barrier set, all pending cross-server modifications have
 * been flushed to the remote servers, and any new modification will be
 * blocked. But that does not guarantee that all the updates have been
 * committed to storage on the remote servers. So when all the instances have
 * completed the phase1 barrier successfully, the MGS notifies all instances
 * to do the phase2 barrier as follows:
 *
 * Every barrier instance calls dt_sync() to commit all asynchronous
 * transactions locally.
 *
 * \param[in] env	pointer to the thread context
 * \param[in] barrier	pointer to the barrier instance
 * \param[in] phase1	indicate whether it is phase1 barrier or not
 *
 * \retval		positive number for timeout
 * \retval		0 for success
 * \retval		negative error number on failure
 */
static int barrier_freeze(const struct lu_env *env,
			  struct barrier_instance *barrier, bool phase1)
{
	time64_t left;
	__s64 inflight = 0;
	int rc;
	ENTRY;

	write_lock(&barrier->bi_rwlock);
	barrier_set(barrier, phase1 ? BS_FREEZING_P1 : BS_FREEZING_P2);

	/* Avoid out-of-order execution of the barrier_set() above
	 * and the check of the inflight modifications count below. */
	smp_mb();

	if (phase1)
		inflight = percpu_counter_sum(&barrier->bi_writers);
	write_unlock(&barrier->bi_rwlock);

	rc = dt_sync(env, barrier->bi_next);
	if (rc)
		RETURN(rc);

	LASSERT(barrier->bi_deadline != 0);

	left = barrier->bi_deadline - ktime_get_real_seconds();
	if (left <= 0)
		RETURN(1);

	if (phase1 && inflight != 0) {
		struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(left),
						     NULL, NULL);

		rc = l_wait_event(barrier->bi_waitq,
				  percpu_counter_sum(&barrier->bi_writers) == 0,
				  &lwi);
		if (rc)
			RETURN(1);

		/* sync again after all inflight modifications are done. */
		rc = dt_sync(env, barrier->bi_next);
		if (rc)
			RETURN(rc);

		if (ktime_get_real_seconds() > barrier->bi_deadline)
			RETURN(1);
	}

	CDEBUG(D_SNAPSHOT, "%s: barrier freezing %s done.\n",
	       barrier_barrier2name(barrier), phase1 ? "phase1" : "phase2");

	if (!phase1)
		barrier_set(barrier, BS_FROZEN);

	RETURN(0);
}
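/* Init/fini hooks: nothing to set up, but verify that every registered
 * instance was deregistered before teardown. */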
void barrier_init(void)
{
}

void barrier_fini(void)
{
	LASSERT(list_empty(&barrier_instance_list));
}
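/*
 * Called before starting a modification on the device.  Returns false if
 * the barrier is freezing or frozen (and not yet past its deadline), in
 * which case the caller must not modify anything; otherwise returns true
 * and, when an instance is registered, bumps the inflight writers counter.
 * A true return must be paired with barrier_exit().
 */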
bool barrier_entry(struct dt_device *key)
{
	struct barrier_instance *barrier;
	bool entered = false;
	ENTRY;

	barrier = barrier_instance_find(key);
	if (unlikely(!barrier))
		/* Fail open: no barrier instance is registered for @key. */
		RETURN(true);

	read_lock(&barrier->bi_rwlock);
	if (likely(barrier->bi_status != BS_FREEZING_P1 &&
		   barrier->bi_status != BS_FREEZING_P2 &&
		   barrier->bi_status != BS_FROZEN) ||
	    ktime_get_real_seconds() > barrier->bi_deadline) {
		percpu_counter_inc(&barrier->bi_writers);
		entered = true;
	}
	read_unlock(&barrier->bi_rwlock);

	barrier_instance_put(barrier);

	RETURN(entered);
}
EXPORT_SYMBOL(barrier_entry);
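/*
 * Called after a modification that passed barrier_entry() completes: drop
 * the inflight writers counter and, if a phase1 freeze is waiting for the
 * counter to drain, wake up the freezer.
 */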
void barrier_exit(struct dt_device *key)
{
	struct barrier_instance *barrier;

	barrier = barrier_instance_find(key);
	if (likely(barrier)) {
		percpu_counter_dec(&barrier->bi_writers);

		/* Avoid out-of-order execution between decreasing the
		 * inflight modifications count and checking the barrier
		 * status. */
		smp_mb();

		if (unlikely(barrier->bi_status == BS_FREEZING_P1))
			wake_up_all(&barrier->bi_waitq);

		barrier_instance_put(barrier);
	}
}
EXPORT_SYMBOL(barrier_exit);
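/*
 * Server side handler for barrier glimpse callbacks from the MGS.  The
 * glimpse descriptor carries the requested barrier status (rescan,
 * phase1/phase2 freezing, thawing, ...); the handler applies it to the
 * local instance and returns the resulting status and MDT index in the LVB.
 */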
int barrier_handler(struct dt_device *key, struct ptlrpc_request *req)
{
	struct ldlm_gl_barrier_desc *desc;
	struct barrier_instance *barrier;
	struct barrier_lvb *lvb;
	struct lu_env env;
	int rc = 0;
	ENTRY;

	/* glimpse on barrier locks always packs a glimpse descriptor */
	req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK_DESC);
	desc = req_capsule_client_get(&req->rq_pill, &RMF_DLM_GL_DESC);
	if (!desc)
		GOTO(out, rc = -EPROTO);

	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
			     sizeof(struct barrier_lvb));
	rc = req_capsule_server_pack(&req->rq_pill);
	if (rc)
		GOTO(out, rc);

	lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
	barrier = barrier_instance_find(key);
	if (!barrier)
		GOTO(out, rc = -ENODEV);

	rc = lu_env_init(&env, LCT_MD_THREAD | LCT_DT_THREAD);
	if (rc)
		GOTO(out_barrier, rc);

	CDEBUG(D_SNAPSHOT,
	       "%s: handling barrier request: status %u, timeout %u\n",
	       barrier_barrier2name(barrier),
	       desc->lgbd_status, desc->lgbd_timeout);

	switch (desc->lgbd_status) {
	case BS_RESCAN:
		barrier_set(barrier, BS_INIT);
		break;
	case BS_FREEZING_P1:
	case BS_FREEZING_P2:
		if (OBD_FAIL_CHECK(OBD_FAIL_BARRIER_FAILURE))
			GOTO(fini, rc = -EINVAL);

		barrier->bi_deadline = ktime_get_real_seconds() +
				       desc->lgbd_timeout;
		rc = barrier_freeze(&env, barrier,
				    desc->lgbd_status == BS_FREEZING_P1);
		break;
	case BS_THAWING:
	case BS_FAILED:
	case BS_EXPIRED:
		barrier_set(barrier, BS_THAWED);
		break;
	default:
		CWARN("%s: unexpected barrier status %u\n",
		      barrier_barrier2name(barrier), desc->lgbd_status);
		rc = -EINVAL;
		break;
	}

fini:
	lu_env_fini(&env);

out_barrier:
	if (rc < 0)
		barrier_set(barrier, BS_FAILED);
	else if (rc > 0)
		barrier_set(barrier, BS_EXPIRED);

	lvb->lvb_status = barrier->bi_status;
	lvb->lvb_index = barrier_dev_idx(barrier);

	CDEBUG(D_SNAPSHOT, "%s: handled barrier request: status %u, "
	       "deadline %lld: rc = %d\n", barrier_barrier2name(barrier),
	       lvb->lvb_status, barrier->bi_deadline, rc);

	barrier_instance_put(barrier);

out:
	req->rq_status = rc;
	RETURN(rc);
}
EXPORT_SYMBOL(barrier_handler);
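/*
 * Register a barrier instance for the device stack rooted at @key.  @next
 * is the device that dt_sync() is called on when freezing; the instance is
 * published on the global list so that barrier_entry()/barrier_exit() and
 * barrier_handler() can find it.
 */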
int barrier_register(struct dt_device *key, struct dt_device *next)
{
	struct barrier_instance *barrier;
	int rc;
	ENTRY;

	OBD_ALLOC_PTR(barrier);
	if (!barrier)
		RETURN(-ENOMEM);

	INIT_LIST_HEAD(&barrier->bi_link);
	barrier->bi_bottom = key;
	barrier->bi_next = next;
	init_waitqueue_head(&barrier->bi_waitq);
	rwlock_init(&barrier->bi_rwlock);
	atomic_set(&barrier->bi_ref, 1);
#ifdef HAVE_PERCPU_COUNTER_INIT_GFP_FLAG
	rc = percpu_counter_init(&barrier->bi_writers, 0, GFP_KERNEL);
#else
	rc = percpu_counter_init(&barrier->bi_writers, 0);
#endif
	if (rc)
		barrier_instance_cleanup(barrier);
	else
		barrier_instance_add(barrier);

	RETURN(rc);
}
EXPORT_SYMBOL(barrier_register);
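/*
 * Remove the instance registered for @key from the global list and drop
 * the initial reference taken by barrier_register().
 */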
void barrier_deregister(struct dt_device *key)
{
	struct barrier_instance *barrier;

	spin_lock(&barrier_instance_lock);
	barrier = barrier_instance_find_locked(key);
	if (barrier)
		list_del_init(&barrier->bi_link);
	spin_unlock(&barrier_instance_lock);

	if (barrier)
		barrier_instance_put(barrier);
}
EXPORT_SYMBOL(barrier_deregister);